From b076f6640e3c2781410588f4a8e4ccfeed8eb606 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Tue, 3 Sep 2024 16:45:28 -0700 Subject: [PATCH 001/425] [lldb] Remove limit on max memory read size (#105765) `memory read` will return an error if you try to read more than 1k bytes in a single command, instructing you to set `target.max-memory-read-size` or use `--force` if you intended to read more than that. This is a safeguard for a command where people are being explicit about how much memory they would like lldb to read (either to display, or save to a file) and is an annoyance every time you need to read more than a small amount. If someone confuses the --count argument with the start address, lldb may begin dumping gigabytes of data but I'd rather that behavior than requiring everyone to special-case their way around a common use case. I don't want to remove the setting because many people have added (much larger) default max read sizes to their ~/.lldbinit files after hitting this behavior. Another option would be to stop reading/using the value in Target.cpp, but I see no harm in leaving the setting if someone really does prefer to have a small cap on their memory read size. --- lldb/source/Target/TargetProperties.td | 2 +- .../memory/big-read/TestMemoryReadMaximumSize.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td index 7bb5bd53688b1..0f68deb543f90 100644 --- a/lldb/source/Target/TargetProperties.td +++ b/lldb/source/Target/TargetProperties.td @@ -102,7 +102,7 @@ let Definition = "target" in { DefaultUnsignedValue<1024>, Desc<"Maximum number of characters to show when using %s in summary strings.">; def MaxMemReadSize: Property<"max-memory-read-size", "UInt64">, - DefaultUnsignedValue<1024>, + DefaultUnsignedValue<0xffffffff>, Desc<"Maximum number of bytes that 'memory read' will fetch before --force must be specified.">; def BreakpointUseAvoidList: Property<"breakpoints-use-platform-avoid-list", "Boolean">, DefaultTrue, diff --git a/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py b/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py index 259fde71a6362..1bc227dfce9be 100644 --- a/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py +++ b/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py @@ -22,6 +22,8 @@ def test_memory_read_max_setting(self): ) self.assertTrue(self.bp.IsValid()) + self.runCmd("settings set target.max-memory-read-size 1024") + self.expect( "mem rea -f x -s 4 -c 2048 `&c`", error=True, From 3e8840ba71bfcceeb598c2ca28d2d8784e24ba1e Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 3 Sep 2024 16:49:42 -0700 Subject: [PATCH 002/425] Remove "Target" from createXReduction naming [nfc] Despite the stale comments, none of these actually use TTI, and they're solely generating standard LLVM IR. 
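A minimal usage sketch (illustration only, not part of this change), assuming only the
declarations this patch leaves in llvm/Transforms/Utils/LoopUtils.h; it shows how a
transform would call the renamed helper, and that the reduction is emitted as plain
LLVM IR with no TTI query involved:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"
  using namespace llvm;

  // Emit an integer add reduction of a vector value. This helper was formerly
  // spelled createSimpleTargetReduction(); the behavior is unchanged, only the
  // misleading "Target" prefix is gone.
  static Value *emitAddReduction(IRBuilderBase &B, Value *VecSrc) {
    return createSimpleReduction(B, VecSrc, RecurKind::Add);
  }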
--- llvm/include/llvm/IR/VectorBuilder.h | 6 ++-- .../include/llvm/Transforms/Utils/LoopUtils.h | 30 ++++++++----------- llvm/lib/IR/VectorBuilder.cpp | 8 ++--- llvm/lib/Transforms/Utils/LoopUtils.cpp | 28 ++++++++--------- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 ++-- 6 files changed, 38 insertions(+), 42 deletions(-) diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h index dbb9f4c7336d5..b0277c2b52595 100644 --- a/llvm/include/llvm/IR/VectorBuilder.h +++ b/llvm/include/llvm/IR/VectorBuilder.h @@ -103,9 +103,9 @@ class VectorBuilder { /// \param ValTy The type of operand which the reduction operation is /// performed. /// \param VecOpArray The operand list. - Value *createSimpleTargetReduction(Intrinsic::ID RdxID, Type *ValTy, - ArrayRef VecOpArray, - const Twine &Name = Twine()); + Value *createSimpleReduction(Intrinsic::ID RdxID, Type *ValTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 56880bd4822c7..ba8af4aa2b0cd 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -393,32 +393,28 @@ Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op, TargetTransformInfo::ReductionShuffle RS, RecurKind MinMaxKind = RecurKind::None); -/// Create a target reduction of the given vector. The reduction operation +/// Create a reduction of the given vector. The reduction operation /// is described by the \p Opcode parameter. min/max reductions require /// additional information supplied in \p RdxKind. -/// The target is queried to determine if intrinsics or shuffle sequences are -/// required to implement the reduction. /// Fast-math-flags are propagated using the IRBuilder's setting. -Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src, - RecurKind RdxKind); -/// Overloaded function to generate vector-predication intrinsics for target +Value *createSimpleReduction(IRBuilderBase &B, Value *Src, + RecurKind RdxKind); +/// Overloaded function to generate vector-predication intrinsics for /// reduction. -Value *createSimpleTargetReduction(VectorBuilder &VB, Value *Src, - const RecurrenceDescriptor &Desc); +Value *createSimpleReduction(VectorBuilder &VB, Value *Src, + const RecurrenceDescriptor &Desc); -/// Create a target reduction of the given vector \p Src for a reduction of the +/// Create a reduction of the given vector \p Src for a reduction of the /// kind RecurKind::IAnyOf or RecurKind::FAnyOf. The reduction operation is /// described by \p Desc. -Value *createAnyOfTargetReduction(IRBuilderBase &B, Value *Src, - const RecurrenceDescriptor &Desc, - PHINode *OrigPhi); +Value *createAnyOfReduction(IRBuilderBase &B, Value *Src, + const RecurrenceDescriptor &Desc, + PHINode *OrigPhi); -/// Create a generic target reduction using a recurrence descriptor \p Desc -/// The target is queried to determine if intrinsics or shuffle sequences are -/// required to implement the reduction. +/// Create a generic reduction using a recurrence descriptor \p Desc /// Fast-math-flags are propagated using the RecurrenceDescriptor. 
-Value *createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, - Value *Src, PHINode *OrigPhi = nullptr); +Value *createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, + Value *Src, PHINode *OrigPhi = nullptr); /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp index b8f56a7a2e5f9..f42948ba89042 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -60,10 +60,10 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy, return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name); } -Value *VectorBuilder::createSimpleTargetReduction(Intrinsic::ID RdxID, - Type *ValTy, - ArrayRef InstOpArray, - const Twine &Name) { +Value *VectorBuilder::createSimpleReduction(Intrinsic::ID RdxID, + Type *ValTy, + ArrayRef InstOpArray, + const Twine &Name) { auto VPID = VPIntrinsic::getForIntrinsic(RdxID); assert(VPReductionIntrinsic::isVPReduction(VPID) && "No VPIntrinsic for this reduction"); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 61f7b23693c7e..559129442a041 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1172,9 +1172,9 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } -Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src, - const RecurrenceDescriptor &Desc, - PHINode *OrigPhi) { +Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src, + const RecurrenceDescriptor &Desc, + PHINode *OrigPhi) { assert( RecurrenceDescriptor::isAnyOfRecurrenceKind(Desc.getRecurrenceKind()) && "Unexpected reduction kind"); @@ -1207,8 +1207,8 @@ Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select"); } -Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, - RecurKind RdxKind) { +Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, + RecurKind RdxKind) { auto *SrcVecEltTy = cast(Src->getType())->getElementType(); auto getIdentity = [&]() { Intrinsic::ID ID = getReductionIntrinsicID(RdxKind); @@ -1241,8 +1241,8 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, } } -Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src, - const RecurrenceDescriptor &Desc) { +Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src, + const RecurrenceDescriptor &Desc) { RecurKind Kind = Desc.getRecurrenceKind(); assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && "AnyOf reduction is not supported."); @@ -1252,12 +1252,12 @@ Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src, Value *Iden = Desc.getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags()); Value *Ops[] = {Iden, Src}; - return VBuilder.createSimpleTargetReduction(Id, SrcTy, Ops); + return VBuilder.createSimpleReduction(Id, SrcTy, Ops); } -Value *llvm::createTargetReduction(IRBuilderBase &B, - const RecurrenceDescriptor &Desc, Value *Src, - PHINode *OrigPhi) { +Value *llvm::createReduction(IRBuilderBase &B, + const RecurrenceDescriptor &Desc, Value *Src, + PHINode *OrigPhi) { // TODO: Support in-order reductions based on the recurrence descriptor. 
// All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. @@ -1266,9 +1266,9 @@ Value *llvm::createTargetReduction(IRBuilderBase &B, RecurKind RK = Desc.getRecurrenceKind(); if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) - return createAnyOfTargetReduction(B, Src, Desc, OrigPhi); + return createAnyOfReduction(B, Src, Desc, OrigPhi); - return createSimpleTargetReduction(B, Src, RK); + return createSimpleReduction(B, Src, RK); } Value *llvm::createOrderedReduction(IRBuilderBase &B, @@ -1295,7 +1295,7 @@ Value *llvm::createOrderedReduction(VectorBuilder &VBuilder, Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd); auto *SrcTy = cast(Src->getType()); Value *Ops[] = {Start, Src}; - return VBuilder.createSimpleTargetReduction(Id, SrcTy, Ops); + return VBuilder.createSimpleReduction(Id, SrcTy, Ops); } void llvm::propagateIRFlags(Value *I, ArrayRef VL, Value *OpValue, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index cf802034cd56a..a7272deb3c34f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -18351,7 +18351,7 @@ class HorizontalReduction { "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; - return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind); + return createSimpleReduction(Builder, VectorizedValue, RdxKind); } /// Emits optimized code for unique scalar value reused \p Cnt times. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 0d3d0febfea1b..69c76edd0f554 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -593,7 +593,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) && !PhiR->isInLoop()) { ReducedPartRdx = - createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); + createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. 
if (PhiTy != RdxDesc.getRecurrenceType()) @@ -1857,7 +1857,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { NextInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); - NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); + NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), NewRed, PrevInChain); @@ -1900,7 +1900,7 @@ void VPReductionEVLRecipe::execute(VPTransformState &State) { if (isOrdered()) { NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); } else { - NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc); + NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); else From eaa95a1c2bd38332c1a4e634595f29d22b28ffea Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Wed, 4 Sep 2024 04:10:46 +0400 Subject: [PATCH 003/425] [clang] Add test for CWG2486 (`noexcept` and function pointer conversion) (#107131) [CWG2486](https://cplusplus.github.io/CWG/issues/2486.html) "Call to `noexcept` function via `noexcept(false)` pointer/lvalue" allows `noexcept` functions to be called via `noexcept(false)` pointers or values. There appears to be no implementation divergence whatsoever: https://godbolt.org/z/3afTfeEM8. That said, in C++14 and earlier we do not issue all the diagnostics we issue in C++17 and newer, so I'm specifying the status of the issue accordingly. --- clang/test/CXX/drs/cwg24xx.cpp | 43 ++++++++++++++++++++++++++++------ clang/www/cxx_dr_status.html | 2 +- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/clang/test/CXX/drs/cwg24xx.cpp b/clang/test/CXX/drs/cwg24xx.cpp index 00b6bb5a865df..79e9d031ef41c 100644 --- a/clang/test/CXX/drs/cwg24xx.cpp +++ b/clang/test/CXX/drs/cwg24xx.cpp @@ -1,15 +1,11 @@ -// RUN: %clang_cc1 -std=c++98 -pedantic-errors %s -verify=expected -// RUN: %clang_cc1 -std=c++11 -pedantic-errors %s -verify=expected -// RUN: %clang_cc1 -std=c++14 -pedantic-errors %s -verify=expected +// RUN: %clang_cc1 -std=c++98 -pedantic-errors %s -verify=expected,cxx98-14 +// RUN: %clang_cc1 -std=c++11 -pedantic-errors %s -verify=expected,cxx98-14 +// RUN: %clang_cc1 -std=c++14 -pedantic-errors %s -verify=expected,cxx98-14 // RUN: %clang_cc1 -std=c++17 -pedantic-errors %s -verify=expected,since-cxx17 // RUN: %clang_cc1 -std=c++20 -pedantic-errors %s -verify=expected,since-cxx17 // RUN: %clang_cc1 -std=c++23 -pedantic-errors %s -verify=expected,since-cxx17 // RUN: %clang_cc1 -std=c++2c -pedantic-errors %s -verify=expected,since-cxx17 -#if __cplusplus <= 201402L -// expected-no-diagnostics -#endif - namespace cwg2406 { // cwg2406: 5 #if __cplusplus >= 201703L void fallthrough(int n) { @@ -186,3 +182,36 @@ namespace cwg2445 { // cwg2445: 19 } #endif } + +namespace cwg2486 { // cwg2486: 4 c++17 +struct C { + void fn() throw(); +}; + +static void call(C& c, void (C::*f)()) { + (c.*f)(); +} + +static void callNE(C& c, void (C::*f)() throw()) { +// cxx98-14-warning@-1 {{mangled name of 'callNE' will change in C++17 due to non-throwing exception specification in function signature}} + (c.*f)(); +} + +void ref() { + C c; + call(c, &C::fn); // <= implicit cast removes noexcept + callNE(c, &C::fn); +} + +void (*p)(); +void (*pp)() throw() = p; +// since-cxx17-error@-1 {{cannot initialize a variable of type 'void (*)() throw()' with an lvalue of 
type 'void (*)()': different exception specifications}} + +struct S { + typedef void (*p)(); + operator p(); // #cwg2486-conv +}; +void (*q)() throw() = S(); +// since-cxx17-error@-1 {{no viable conversion from 'S' to 'void (*)() throw()'}} +// since-cxx17-note@#cwg2486-conv {{candidate function}} +} // namespace cwg2486 diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 4f4d8d0a97d43..ca25776823cfa 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -14751,7 +14751,7 @@

C++ defect report implementation status

2486 CD6 Call to noexcept function via noexcept(false) pointer/lvalue - Unknown + Clang 4 (C++17 onwards) 2487 From 83ad644afaac23577e3563d3ec1fac1b1fde37f4 Mon Sep 17 00:00:00 2001 From: Freddy Ye Date: Wed, 4 Sep 2024 08:13:24 +0800 Subject: [PATCH 004/425] [X86][AVX10.2] Support AVX10.2-BF16 new instructions. (#101603) Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 --- clang/include/clang/Basic/BuiltinsX86.def | 62 + clang/lib/Basic/Targets/X86.cpp | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 23 + clang/lib/Headers/CMakeLists.txt | 2 + clang/lib/Headers/avx10_2_512bf16intrin.h | 565 +++ clang/lib/Headers/avx10_2bf16intrin.h | 1091 ++++++ clang/lib/Headers/immintrin.h | 2 + clang/lib/Sema/SemaX86.cpp | 9 + .../CodeGen/X86/avx10_2_512bf16-builtins.c | 1085 ++++++ clang/test/CodeGen/X86/avx10_2bf16-builtins.c | 2082 ++++++++++++ llvm/include/llvm/IR/IntrinsicsX86.td | 253 ++ .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 8 +- .../X86/MCTargetDesc/X86ATTInstPrinter.cpp | 12 +- .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 11 + .../X86/MCTargetDesc/X86IntelInstPrinter.cpp | 9 + llvm/lib/Target/X86/X86ISelLowering.cpp | 37 +- llvm/lib/Target/X86/X86InstrAVX10.td | 315 ++ llvm/lib/Target/X86/X86InstrAVX512.td | 4 +- llvm/lib/Target/X86/X86InstrFMA3Info.cpp | 35 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 5 + llvm/lib/Target/X86/X86InstrUtils.td | 6 +- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 54 + llvm/test/CodeGen/X86/avx10.2-fma-commute.ll | 1244 +++++++ .../test/CodeGen/X86/avx10_2_512bf16-arith.ll | 587 ++++ .../CodeGen/X86/avx10_2_512bf16-intrinsics.ll | 230 ++ llvm/test/CodeGen/X86/avx10_2bf16-arith.ll | 1168 +++++++ .../CodeGen/X86/avx10_2bf16-intrinsics.ll | 602 ++++ .../MC/Disassembler/X86/avx10.2-bf16-32.txt | 3015 +++++++++++++++++ .../MC/Disassembler/X86/avx10.2-bf16-64.txt | 3015 +++++++++++++++++ llvm/test/MC/X86/avx10.2-bf16-32-att.s | 3014 ++++++++++++++++ llvm/test/MC/X86/avx10.2-bf16-32-intel.s | 3014 ++++++++++++++++ llvm/test/MC/X86/avx10.2-bf16-64-att.s | 3014 ++++++++++++++++ llvm/test/MC/X86/avx10.2-bf16-64-intel.s | 3014 ++++++++++++++++ llvm/test/TableGen/x86-fold-tables.inc | 494 +++ 34 files changed, 28058 insertions(+), 24 deletions(-) create mode 100644 clang/lib/Headers/avx10_2_512bf16intrin.h create mode 100644 clang/lib/Headers/avx10_2bf16intrin.h create mode 100644 clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c create mode 100644 clang/test/CodeGen/X86/avx10_2bf16-builtins.c create mode 100644 llvm/test/CodeGen/X86/avx10.2-fma-commute.ll create mode 100644 llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll create mode 100644 llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll create mode 100644 llvm/test/CodeGen/X86/avx10_2bf16-arith.ll create mode 100644 llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll create mode 100644 llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt create mode 100644 llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt create mode 100644 llvm/test/MC/X86/avx10.2-bf16-32-att.s create mode 100644 llvm/test/MC/X86/avx10.2-bf16-32-intel.s create mode 100644 llvm/test/MC/X86/avx10.2-bf16-64-att.s create mode 100644 llvm/test/MC/X86/avx10.2-bf16-64-intel.s diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index e4aa8661b9a80..48376ee052798 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -2261,6 +2261,68 @@ TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:" 
TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256") TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256") TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512") + +// AVX10.2 BF16 +TARGET_BUILTIN(__builtin_ia32_loadsbf16128_mask, "V8yV8yC*V8yUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_storesbf16128_mask, "vV8y*V8yUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vaddnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vaddnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vaddnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vdivnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vdivnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vdivnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vmaxpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmaxpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmaxpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vminpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vminpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vminpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vmulnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmulnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmulnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vsubnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsubnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsubnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16eq, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16lt, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16neq, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16ge, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16gt, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16le, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcmppbf16512_mask,"UiV32yV32yIiUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcmppbf16256_mask,"UsV16yV16yIiUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcmppbf16128_mask,"UcV8yV8yIiUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16128_mask, "UcV8yIiUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16256_mask, "UsV16yIiUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16512_mask, "UiV32yIiUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vscalefpbf16128_mask, "V8yV8yV8yV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vscalefpbf16256_mask, "V16yV16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vscalefpbf16512_mask, "V32yV32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vrcppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256") 
+TARGET_BUILTIN(__builtin_ia32_vrcppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrcppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vgetexppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetexppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetexppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vreducenepbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vreducenepbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vreducenepbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16, "V8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16256, "V16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16512, "V32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh512, "V32yV32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh256, "V16yV16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh128, "V8yV8yV8yV8y", "ncV:128:", "avx10.2-256") + #undef BUILTIN #undef TARGET_BUILTIN #undef TARGET_HEADER_BUILTIN diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index a9cbdb7b10dff..62c382b67ad14 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -306,6 +306,7 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAVX10_1_512 = true; } else if (Feature == "+avx10.2-256") { HasAVX10_2 = true; + HasFullBFloat16 = true; } else if (Feature == "+avx10.2-512") { HasAVX10_2_512 = true; } else if (Feature == "+avx512cd") { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e4d169d2ad603..786c2c224b349 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14728,6 +14728,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_storeups512_mask: return EmitX86MaskedStore(*this, Ops, Align(1)); + case X86::BI__builtin_ia32_storesbf16128_mask: case X86::BI__builtin_ia32_storesh128_mask: case X86::BI__builtin_ia32_storess128_mask: case X86::BI__builtin_ia32_storesd128_mask: @@ -14836,6 +14837,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vfmaddph512_mask: case X86::BI__builtin_ia32_vfmaddph512_maskz: case X86::BI__builtin_ia32_vfmaddph512_mask3: 
+ case X86::BI__builtin_ia32_vfmaddnepbh128: + case X86::BI__builtin_ia32_vfmaddnepbh256: + case X86::BI__builtin_ia32_vfmaddnepbh512: case X86::BI__builtin_ia32_vfmaddps512_mask: case X86::BI__builtin_ia32_vfmaddps512_maskz: case X86::BI__builtin_ia32_vfmaddps512_mask3: @@ -14920,6 +14924,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_loaddqudi512_mask: return EmitX86MaskedLoad(*this, Ops, Align(1)); + case X86::BI__builtin_ia32_loadsbf16128_mask: case X86::BI__builtin_ia32_loadsh128_mask: case X86::BI__builtin_ia32_loadss128_mask: case X86::BI__builtin_ia32_loadsd128_mask: @@ -16074,6 +16079,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_sqrtph256: case X86::BI__builtin_ia32_sqrtph: case X86::BI__builtin_ia32_sqrtph512: + case X86::BI__builtin_ia32_vsqrtnepbf16256: + case X86::BI__builtin_ia32_vsqrtnepbf16: + case X86::BI__builtin_ia32_vsqrtnepbf16512: case X86::BI__builtin_ia32_sqrtps512: case X86::BI__builtin_ia32_sqrtpd512: { if (Ops.size() == 2) { @@ -16293,6 +16301,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_fpclassps128_mask: case X86::BI__builtin_ia32_fpclassps256_mask: case X86::BI__builtin_ia32_fpclassps512_mask: + case X86::BI__builtin_ia32_vfpclasspbf16128_mask: + case X86::BI__builtin_ia32_vfpclasspbf16256_mask: + case X86::BI__builtin_ia32_vfpclasspbf16512_mask: case X86::BI__builtin_ia32_fpclassph128_mask: case X86::BI__builtin_ia32_fpclassph256_mask: case X86::BI__builtin_ia32_fpclassph512_mask: @@ -16307,6 +16318,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Intrinsic::ID ID; switch (BuiltinID) { default: llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_vfpclasspbf16128_mask: + ID = Intrinsic::x86_avx10_fpclass_nepbf16_128; + break; + case X86::BI__builtin_ia32_vfpclasspbf16256_mask: + ID = Intrinsic::x86_avx10_fpclass_nepbf16_256; + break; + case X86::BI__builtin_ia32_vfpclasspbf16512_mask: + ID = Intrinsic::x86_avx10_fpclass_nepbf16_512; + break; case X86::BI__builtin_ia32_fpclassph128_mask: ID = Intrinsic::x86_avx512fp16_fpclass_ph_128; break; @@ -16465,6 +16485,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vcmppd256_round_mask: case X86::BI__builtin_ia32_vcmpps256_round_mask: case X86::BI__builtin_ia32_vcmpph256_round_mask: + case X86::BI__builtin_ia32_vcmppbf16512_mask: + case X86::BI__builtin_ia32_vcmppbf16256_mask: + case X86::BI__builtin_ia32_vcmppbf16128_mask: IsMaskFCmp = true; [[fallthrough]]; case X86::BI__builtin_ia32_cmpps: diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 5a62538792f30..e928b5b142827 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -147,10 +147,12 @@ set(x86_files amxcomplexintrin.h amxfp16intrin.h amxintrin.h + avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h avx10_2_512niintrin.h avx10_2_512satcvtintrin.h + avx10_2bf16intrin.h avx10_2convertintrin.h avx10_2minmaxintrin.h avx10_2niintrin.h diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h new file mode 100644 index 0000000000000..392b7ae770c5b --- /dev/null +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h @@ -0,0 +1,565 @@ +/*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifdef __SSE2__ + +#ifndef __AVX10_2_512BF16INTRIN_H +#define __AVX10_2_512BF16INTRIN_H + +/* Define the default attributes for the functions in this file. */ +typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \ + __min_vector_width__(512))) + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) { + return __builtin_bit_cast(__m512bh, _mm512_setzero_ps()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) { + return (__m512bh)__builtin_ia32_undef512(); +} + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) { + return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf, bf, bf}; +} + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh( + __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6, + __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17, + __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22, + __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27, + __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) { + return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25, + bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17, + bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9, + bf8, bf7, bf6, bf5, bf4, bf3, bf2, bf1}; +} + +#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \ + bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19, \ + bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28, \ + bf29, bf30, bf31, bf32) \ + _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26), \ + (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19), \ + (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12), \ + (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), \ + (bf3), (bf2), (bf1)) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_castpbf16_ps(__m512bh __a) { + return (__m512)__a; +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_castpbf16_pd(__m512bh __a) { + return (__m512d)__a; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_castpbf16_si512(__m512bh __a) { + return (__m512i)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) { + return (__m512bh)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castpd_pbh(__m512d __a) { + return (__m512bh)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castsi512_pbh(__m512i __a) { + return (__m512bh)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS512 +_mm512_castpbf16512_pbh128(__m512bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_castpbf16512_pbh256(__m512bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); +} + +static __inline__ __m512bh 
__DEFAULT_FN_ATTRS512 +_mm512_castpbf16128_pbh512(__m128bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castpbf16256_pbh512(__m256bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_zextpbf16128_pbh512(__m128bh __a) { + return __builtin_shufflevector( + __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_zextpbf16256_pbh512(__m256bh __a) { + return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) { + return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), + (__m512i)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_load_pbh(void const *__p) { + return *(const __m512bh *)__p; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_loadu_pbh(void const *__p) { + struct __loadu_pbh { + __m512bh_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pbh *)__p)->__v; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P, + __m512bh __A) { + *(__m512bh *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P, + __m512bh __A) { + struct __storeu_pbh { + __m512bh_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pbh *)__P)->__v = __A; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) { + return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W, + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { + return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, + (__v32hi)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_addne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A + (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_addne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_addne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_subne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A - (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_subne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return 
(__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_subne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mulne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A * (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_mulne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_mulne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_mulne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_mulne_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_divne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A / (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_divne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_divne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_divne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_divne_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_max_pbh(__m512bh __A, + __m512bh __B) { + return (__m512bh)__builtin_ia32_vmaxpbf16512((__v32bf)__A, (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_min_pbh(__m512bh __A, + __m512bh __B) { + return (__m512bh)__builtin_ia32_vminpbf16512((__v32bf)__A, (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +#define _mm512_cmp_pbh_mask(__A, __B, __P) \ + ((__mmask32)__builtin_ia32_vcmppbf16512_mask((__v32bf)(__m512bh)(__A), \ + (__v32bf)(__m512bh)(__B), \ + (int)(__P), (__mmask32) - 1)) + +#define _mm512_mask_cmp_pbh_mask(__U, __A, __B, __P) \ + ((__mmask32)__builtin_ia32_vcmppbf16512_mask((__v32bf)(__m512bh)(__A), \ + (__v32bf)(__m512bh)(__B), \ + (int)(__P), (__mmask32)(__U))) + +#define _mm512_mask_fpclass_pbh_mask(__U, __A, imm) \ + 
((__mmask32)__builtin_ia32_vfpclasspbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)(__U))) + +#define _mm512_fpclass_pbh_mask(__A, imm) \ + ((__mmask32)__builtin_ia32_vfpclasspbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32) - 1)) + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_scalef_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_vscalefpbf16512_mask( + (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_undefined_pbh(), + (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pbh( + __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_vscalefpbf16512_mask( + (__v32bf)__A, (__v32bf)__B, (__v32bf)__W, (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_vscalefpbf16512_mask( + (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_setzero_pbh(), + (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_rcp_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vrcppbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrcppbf16512_mask((__v32bf)__A, (__v32bf)__W, + (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrcppbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_getexp_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vgetexppbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vgetexppbf16512_mask( + (__v32bf)__A, (__v32bf)__W, (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vgetexppbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_rsqrt_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vrsqrtpbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrsqrtpbf16512_mask( + (__v32bf)__A, (__v32bf)__W, (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrsqrtpbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U); +} + +#define _mm512_reducene_pbh(__A, imm) \ + ((__m512bh)__builtin_ia32_vreducenepbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_undefined_pbh(), \ + (__mmask32) - 1)) + +#define _mm512_mask_reducene_pbh(__W, __U, __A, imm) \ + ((__m512bh)__builtin_ia32_vreducenepbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \ + (__mmask32)(__U))) + +#define _mm512_maskz_reducene_pbh(__U, __A, imm) \ + ((__m512bh)__builtin_ia32_vreducenepbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), 
(__v32bf)_mm512_setzero_pbh(), \ + (__mmask32)(__U))) + +#define _mm512_roundscalene_pbh(__A, imm) \ + ((__m512bh)__builtin_ia32_vrndscalenepbf16_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \ + (__mmask32) - 1)) + +#define _mm512_mask_roundscalene_pbh(__W, __U, __A, imm) \ + ((__m512bh)__builtin_ia32_vrndscalenepbf16_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \ + (__mmask32)(__U))) + +#define _mm512_maskz_roundscalene_pbh(__U, __A, imm) \ + ((__m512bh)__builtin_ia32_vrndscalenepbf16_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \ + (__mmask32)(__U))) + +#define _mm512_getmant_pbh(__A, __B, __C) \ + ((__m512bh)__builtin_ia32_vgetmantpbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v32bf)_mm512_undefined_pbh(), (__mmask32) - 1)) + +#define _mm512_mask_getmant_pbh(__W, __U, __A, __B, __C) \ + ((__m512bh)__builtin_ia32_vgetmantpbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v32bf)(__m512bh)(__W), (__mmask32)(__U))) + +#define _mm512_maskz_getmant_pbh(__U, __A, __B, __C) \ + ((__m512bh)__builtin_ia32_vgetmantpbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U))) + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vsqrtnepbf16512((__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_sqrt_pbh(__A), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, + (__v32bf)_mm512_sqrt_pbh(__A), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B, + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fmaddne_pbh( + __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmaddne_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmaddne_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B, + -(__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fmsubne_pbh( + __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__A); +} + +static 
__inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsubne_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsubne_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fnmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B, + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmaddne_pbh( + __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmaddne_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmaddne_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fnmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B, + -(__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsubne_pbh( + __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsubne_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsubne_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +#undef __DEFAULT_FN_ATTRS512 + +#endif +#endif diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h new file mode 100644 index 0000000000000..0a427b9b7418b --- /dev/null +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -0,0 +1,1091 @@ +/*===-------------- avx10_2bf16intrin.h - AVX10-BF16 intrinsics ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." 
+#endif + +#ifdef __SSE2__ + +#ifndef __AVX10_2BF16INTRIN_H +#define __AVX10_2BF16INTRIN_H + +typedef __bf16 __m128bh_u __attribute__((__vector_size__(16), __aligned__(1))); +typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(128))) + +static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) { + return __builtin_bit_cast(__m256bh, _mm256_setzero_ps()); +} + +static __inline __m128bh __DEFAULT_FN_ATTRS128 _mm_setzero_pbh(void) { + return __builtin_bit_cast(__m128bh, _mm_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castpbf16_ps(__m128bh __a) { + return (__m128)__a; +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_castpbf16_ps(__m256bh __a) { + return (__m256)__a; +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_castpbf16_pd(__m256bh __a) { + return (__m256d)__a; +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castpbf16_pd(__m128bh __a) { + return (__m128d)__a; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_castpbf16_si128(__m128bh __a) { + return (__m128i)__a; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_castpbf16_si256(__m256bh __a) { + return (__m256i)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_castps_pbh(__m128 __a) { + return (__m128bh)__a; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_castps_pbh(__m256 __a) { + return (__m256bh)__a; +} + +static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtsbh_bf16(__m128bh __a) { + return __a[0]; +} + +static __inline__ __bf16 __DEFAULT_FN_ATTRS256 +_mm256_cvtsbh_bf16(__m256bh __a) { + return __a[0]; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_castpd_pbh(__m128d __a) { + return (__m128bh)__a; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_castpd_pbh(__m256d __a) { + return (__m256bh)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_castsi128_pbh(__m128i __a) { + return (__m128bh)__a; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_castsi256_pbh(__m256i __a) { + return (__m256bh)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_castpbf16256_pbh128(__m256bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_castpbf16128_pbh256(__m128bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_zextpbf16128_pbh256(__m128bh __a) { + return __builtin_shufflevector(__a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_undefined_pbh(void) { + return (__m256bh)__builtin_ia32_undef256(); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_load_sbh(void const *__dp) { + __m128bh src = (__v8bf)_mm_setzero_pbh(); + return (__m128bh)__builtin_ia32_loadsbf16128_mask((const __v8bf *)__dp, src, + 1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_load_sbh(__m128bh __W, __mmask8 __U, const void *__A) { + __m128bh src = (__v8bf)__builtin_shufflevector( + (__v8bf)__W, 
(__v8bf)_mm_setzero_pbh(), 0, 8, 8, 8, 8, 8, 8, 8); + + return (__m128bh)__builtin_ia32_loadsbf16128_mask((const __v8bf *)__A, src, + __U & 1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_load_sbh(__mmask8 __U, const void *__A) { + return (__m128bh)__builtin_ia32_loadsbf16128_mask( + (const __v8bf *)__A, (__v8bf)_mm_setzero_pbh(), __U & 1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_load_pbh(void const *__p) { + return *(const __m256bh *)__p; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_load_pbh(void const *__p) { + return *(const __m128bh *)__p; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_loadu_pbh(void const *__p) { + struct __loadu_pbh { + __m256bh_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pbh *)__p)->__v; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_loadu_pbh(void const *__p) { + struct __loadu_pbh { + __m128bh_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pbh *)__p)->__v; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sbh(void *__dp, + __m128bh __a) { + struct __mm_store_sbh_struct { + __bf16 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store_sbh_struct *)__dp)->__u = __a[0]; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sbh(void *__W, + __mmask8 __U, + __m128bh __A) { + __builtin_ia32_storesbf16128_mask((__v8bf *)__W, __A, __U & 1); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_pbh(void *__P, + __m256bh __A) { + *(__m256bh *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_pbh(void *__P, + __m128bh __A) { + *(__m128bh *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_pbh(void *__P, + __m256bh __A) { + struct __storeu_pbh { + __m256bh_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pbh *)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_pbh(void *__P, + __m128bh __A) { + struct __storeu_pbh { + __m128bh_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pbh *)__P)->__v = __A; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_move_sbh(__m128bh __a, + __m128bh __b) { + __a[0] = __b[0]; + return __a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), __W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), + _mm_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_undefined_pbh(void) { + return (__m128bh)__builtin_ia32_undef128(); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_set_sbh(__bf16 bf) { + return (__v8bf)__builtin_shufflevector( + (__v8bf){bf, bf, bf, bf, bf, bf, bf, bf}, (__v8bf)_mm_setzero_pbh(), 0, 8, + 8, 8, 8, 8, 8, 8); +} + +static __inline __m128bh __DEFAULT_FN_ATTRS128 _mm_set1_pbh(__bf16 bf) { + return (__m128bh)(__v8bf){bf, bf, bf, bf, bf, bf, bf, bf}; +} + +static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_set1_pbh(__bf16 bf) { + return (__m256bh)(__v16bf){bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf}; +} + +static __inline __m128bh __DEFAULT_FN_ATTRS128 +_mm_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, + __bf16 bf6, __bf16 
bf7, __bf16 bf8) { + return (__m128bh)(__v8bf){bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8}; +} + +static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_set_pbh( + __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6, + __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16) { + return (__m256bh)(__v16bf){bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16}; +} + +#define _mm_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8) \ + _mm_set_pbh((bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), (bf1)) + +#define _mm256_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \ + bf11, bf12, bf13, bf14, bf15, bf16) \ + _mm256_set_pbh((bf16), (bf15), (bf14), (bf13), (bf12), (bf11), (bf10), \ + (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), \ + (bf1)) + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_abs_pbh(__m256bh __A) { + return (__m256bh)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), + (__m256i)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_abs_pbh(__m128bh __A) { + return (__m128bh)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_blend_pbh(__mmask8 __U, __m128bh __A, __m128bh __W) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, (__v8bf)__W, + (__v8bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) { + return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, (__v16bf)__W, + (__v16bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) { + return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, + (__v8hi)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { + return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, + (__v16hi)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_permutexvar_pbh(__m128i __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_permutexvar_pbh(__m256i __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_addne_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)((__v16bf)__A + (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_addne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_addne_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_addne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_addne_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_addne_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A + (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_addne_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_addne_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 
+_mm_maskz_addne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, + (__v8bf)_mm_addne_pbh(__A, __B), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_subne_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)((__v16bf)__A - (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_subne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_subne_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_subne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_subne_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_subne_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A - (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_subne_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_subne_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_subne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, + (__v8bf)_mm_subne_pbh(__A, __B), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mulne_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)((__v16bf)__A * (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_mulne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_mulne_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_mulne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_mulne_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_mulne_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A * (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_mulne_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_mulne_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_mulne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, + (__v8bf)_mm_mulne_pbh(__A, __B), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_divne_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)((__v16bf)__A / (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_divne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_divne_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_divne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_divne_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 
_mm_divne_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A / (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_divne_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_divne_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_divne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, + (__v8bf)_mm_divne_pbh(__A, __B), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_max_pbh(__m256bh __A, + __m256bh __B) { + return (__m256bh)__builtin_ia32_vmaxpbf16256((__v16bf)__A, (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_max_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_max_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_max_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_max_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)__builtin_ia32_vmaxpbf16128((__v8bf)__A, (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_max_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_max_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_max_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_max_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_min_pbh(__m256bh __A, + __m256bh __B) { + return (__m256bh)__builtin_ia32_vminpbf16256((__v16bf)__A, (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_min_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_min_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_min_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_min_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)__builtin_ia32_vminpbf16128((__v8bf)__A, (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_min_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_min_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_min_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_min_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comeqsbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomsbf16eq((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comltsbh(__m128bh A, + __m128bh B) { + return 
__builtin_ia32_vcomsbf16lt((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comlesbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomsbf16le((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comgtsbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomsbf16gt((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comgesbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomsbf16ge((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comneqsbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomsbf16neq((__v8bf)A, (__v8bf)B); +} + +#define _mm256_cmp_pbh_mask(__A, __B, __P) \ + ((__mmask16)__builtin_ia32_vcmppbf16256_mask((__v16bf)(__m256bh)(__A), \ + (__v16bf)(__m256bh)(__B), \ + (int)(__P), (__mmask16) - 1)) + +#define _mm256_mask_cmp_pbh_mask(__U, __A, __B, __P) \ + ((__mmask16)__builtin_ia32_vcmppbf16256_mask((__v16bf)(__m256bh)(__A), \ + (__v16bf)(__m256bh)(__B), \ + (int)(__P), (__mmask16)(__U))) + +#define _mm_cmp_pbh_mask(__A, __B, __P) \ + ((__mmask8)__builtin_ia32_vcmppbf16128_mask((__v8bf)(__m128bh)(__A), \ + (__v8bf)(__m128bh)(__B), \ + (int)(__P), (__mmask8) - 1)) + +#define _mm_mask_cmp_pbh_mask(__U, __A, __B, __P) \ + ((__mmask8)__builtin_ia32_vcmppbf16128_mask((__v8bf)(__m128bh)(__A), \ + (__v8bf)(__m128bh)(__B), \ + (int)(__P), (__mmask8)(__U))) + +#define _mm256_mask_fpclass_pbh_mask(__U, __A, imm) \ + ((__mmask16)__builtin_ia32_vfpclasspbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__mmask16)(__U))) + +#define _mm256_fpclass_pbh_mask(__A, imm) \ + ((__mmask16)__builtin_ia32_vfpclasspbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__mmask16) - 1)) + +#define _mm_mask_fpclass_pbh_mask(__U, __A, imm) \ + ((__mmask8)__builtin_ia32_vfpclasspbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__mmask8)(__U))) + +#define _mm_fpclass_pbh_mask(__A, imm) \ + ((__mmask8)__builtin_ia32_vfpclasspbf16128_mask((__v8bf)(__m128bh)(__A), \ + (int)(imm), (__mmask8) - 1)) + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_scalef_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_vscalefpbf16256_mask( + (__v16bf)__A, (__v16bf)__B, (__v16bf)_mm256_undefined_pbh(), + (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_scalef_pbh( + __m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_vscalefpbf16256_mask( + (__v16bf)__A, (__v16bf)__B, (__v16bf)__W, (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_scalef_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_vscalefpbf16256_mask( + (__v16bf)__A, (__v16bf)__B, (__v16bf)_mm256_setzero_pbh(), + (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_scalef_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)__builtin_ia32_vscalefpbf16128_mask( + (__v8bf)__A, (__v8bf)__B, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_vscalefpbf16128_mask( + (__v8bf)__A, (__v8bf)__B, (__v8bf)__W, (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_vscalefpbf16128_mask( + (__v8bf)__A, (__v8bf)__B, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +static __inline__ 
__m256bh __DEFAULT_FN_ATTRS256 _mm256_rcp_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vrcppbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_rcp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrcppbf16256_mask((__v16bf)__A, (__v16bf)__W, + (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_rcp_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrcppbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_rcp_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vrcppbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_rcp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vrcppbf16128_mask((__v8bf)__A, (__v8bf)__W, + (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_rcp_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vrcppbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_getexp_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vgetexppbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_getexp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vgetexppbf16256_mask( + (__v16bf)__A, (__v16bf)__W, (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_getexp_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vgetexppbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_getexp_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vgetexppbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vgetexppbf16128_mask((__v8bf)__A, (__v8bf)__W, + (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vgetexppbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_rsqrt_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vrsqrtpbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_rsqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrsqrtpbf16256_mask( + (__v16bf)__A, (__v16bf)__W, (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_rsqrt_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrsqrtpbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_rsqrt_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vrsqrtpbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_rsqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return 
(__m128bh)__builtin_ia32_vrsqrtpbf16128_mask((__v8bf)__A, (__v8bf)__W, + (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vrsqrtpbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +#define _mm256_reducene_pbh(__A, imm) \ + ((__m256bh)__builtin_ia32_vreducenepbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_undefined_pbh(), \ + (__mmask16) - 1)) + +#define _mm256_mask_reducene_pbh(__W, __U, __A, imm) \ + ((__m256bh)__builtin_ia32_vreducenepbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)(__m256bh)(__W), \ + (__mmask16)(__U))) + +#define _mm256_maskz_reducene_pbh(__U, __A, imm) \ + ((__m256bh)__builtin_ia32_vreducenepbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ + (__mmask16)(__U))) + +#define _mm_reducene_pbh(__A, imm) \ + ((__m128bh)__builtin_ia32_vreducenepbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_undefined_pbh(), \ + (__mmask8) - 1)) + +#define _mm_mask_reducene_pbh(__W, __U, __A, imm) \ + ((__m128bh)__builtin_ia32_vreducenepbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)(__m128bh)(__W), \ + (__mmask8)(__U))) + +#define _mm_maskz_reducene_pbh(__U, __A, imm) \ + ((__m128bh)__builtin_ia32_vreducenepbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ + (__mmask8)(__U))) + +#define _mm256_roundscalene_pbh(__A, imm) \ + ((__m256bh)__builtin_ia32_vrndscalenepbf16_256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ + (__mmask16) - 1)) + +#define _mm256_mask_roundscalene_pbh(__W, __U, __A, imm) \ + ((__m256bh)__builtin_ia32_vrndscalenepbf16_256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)(__m256bh)(__W), \ + (__mmask16)(__U))) + +#define _mm256_maskz_roundscalene_pbh(__U, __A, imm) \ + ((__m256bh)__builtin_ia32_vrndscalenepbf16_256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ + (__mmask16)(__U))) + +#define _mm_roundscalene_pbh(__A, imm) \ + ((__m128bh)__builtin_ia32_vrndscalenepbf16_128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ + (__mmask8) - 1)) + +#define _mm_mask_roundscalene_pbh(__W, __U, __A, imm) \ + ((__m128bh)__builtin_ia32_vrndscalenepbf16_128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)(__m128bh)(__W), \ + (__mmask8)(__U))) + +#define _mm_maskz_roundscalene_pbh(__U, __A, imm) \ + ((__m128bh)__builtin_ia32_vrndscalenepbf16_128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ + (__mmask8)(__U))) + +#define _mm256_getmant_pbh(__A, __B, __C) \ + ((__m256bh)__builtin_ia32_vgetmantpbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v16bf)_mm256_undefined_pbh(), (__mmask16) - 1)) + +#define _mm256_mask_getmant_pbh(__W, __U, __A, __B, __C) \ + ((__m256bh)__builtin_ia32_vgetmantpbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v16bf)(__m256bh)(__W), (__mmask16)(__U))) + +#define _mm256_maskz_getmant_pbh(__U, __A, __B, __C) \ + ((__m256bh)__builtin_ia32_vgetmantpbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v16bf)_mm256_setzero_pbh(), (__mmask16)(__U))) + +#define _mm_getmant_pbh(__A, __B, __C) \ + ((__m128bh)__builtin_ia32_vgetmantpbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v8bf)_mm_undefined_pbh(), (__mmask8) - 1)) + +#define 
_mm_mask_getmant_pbh(__W, __U, __A, __B, __C) \ + ((__m128bh)__builtin_ia32_vgetmantpbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v8bf)(__m128bh)(__W), (__mmask8)(__U))) + +#define _mm_maskz_getmant_pbh(__U, __A, __B, __C) \ + ((__m128bh)__builtin_ia32_vgetmantpbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v8bf)_mm_setzero_pbh(), (__mmask8)(__U))) + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vsqrtnepbf16256((__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_sqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_sqrt_pbh(__A), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, + (__v16bf)_mm256_sqrt_pbh(__A), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vsqrtnepbf16((__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_sqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_sqrt_pbh(__A), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_sqrt_pbh(__A), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, (__v16bf)__B, + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fmaddne_pbh( + __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fmaddne_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmaddne_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, (__v16bf)__B, + -(__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fmsubne_pbh( + __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsubne_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__C); +} + +static __inline__ 
__m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsubne_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fnmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, -(__v16bf)__B, + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmaddne_pbh( + __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmaddne_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmaddne_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fnmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, -(__v16bf)__B, + -(__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsubne_pbh( + __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsubne_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsubne_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmaddne_pbh(__m128bh __A, + __m128bh __B, + __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, (__v8bf)__B, + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fmaddne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fmaddne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + 
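+/* The 128-bit FMA family below mirrors the 256-bit one above: fmsub, fnmadd
+   and fnmsub are all expressed through __builtin_ia32_vfmaddnepbh128 by
+   negating __C, __B, or both, and the mask_/mask3_/maskz_ variants blend
+   inactive lanes from __A, from __C, or with zero respectively.
+   Illustrative use (variable names are placeholders only):
+     __m128bh r = _mm_mask_fmaddne_pbh(a, 0x0F, b, c);
+   lanes 0-3 of r hold a*b+c, lanes 4-7 keep a. */
+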
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmsubne_pbh(__m128bh __A, + __m128bh __B, + __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, (__v8bf)__B, + -(__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fmsubne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsubne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_fnmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, -(__v8bf)__B, + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fnmaddne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmaddne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_fnmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, -(__v8bf)__B, + -(__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsubne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsubne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif +#endif diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index a922056622e79..30fcc028958f3 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -649,6 +649,7 @@ _storebe_i64(void * __P, long long __D) { #endif #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__) 
+#include <avx10_2bf16intrin.h>
 #include
 #include
 #include
@@ -656,6 +657,7 @@ _storebe_i64(void * __P, long long __D) {
 #endif
 
 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
+#include <avx10_2_512bf16intrin.h>
 #include
 #include
 #include
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 311e574537059..233a068c8574c 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -875,6 +875,9 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   case X86::BI__builtin_ia32_rndscaleps_mask:
   case X86::BI__builtin_ia32_rndscalepd_mask:
   case X86::BI__builtin_ia32_rndscaleph_mask:
+  case X86::BI__builtin_ia32_vrndscalenepbf16_128_mask:
+  case X86::BI__builtin_ia32_vrndscalenepbf16_256_mask:
+  case X86::BI__builtin_ia32_vrndscalenepbf16_mask:
   case X86::BI__builtin_ia32_reducepd128_mask:
   case X86::BI__builtin_ia32_reducepd256_mask:
   case X86::BI__builtin_ia32_reducepd512_mask:
@@ -884,6 +887,9 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   case X86::BI__builtin_ia32_reduceph128_mask:
   case X86::BI__builtin_ia32_reduceph256_mask:
   case X86::BI__builtin_ia32_reduceph512_mask:
+  case X86::BI__builtin_ia32_vreducenepbf16128_mask:
+  case X86::BI__builtin_ia32_vreducenepbf16256_mask:
+  case X86::BI__builtin_ia32_vreducenepbf16512_mask:
   case X86::BI__builtin_ia32_vreducepd256_round_mask:
   case X86::BI__builtin_ia32_vreduceps256_round_mask:
   case X86::BI__builtin_ia32_vreduceph256_round_mask:
@@ -911,6 +917,9 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   case X86::BI__builtin_ia32_fpclassph128_mask:
   case X86::BI__builtin_ia32_fpclassph256_mask:
   case X86::BI__builtin_ia32_fpclassph512_mask:
+  case X86::BI__builtin_ia32_vfpclasspbf16128_mask:
+  case X86::BI__builtin_ia32_vfpclasspbf16256_mask:
+  case X86::BI__builtin_ia32_vfpclasspbf16512_mask:
   case X86::BI__builtin_ia32_fpclasssd_mask:
   case X86::BI__builtin_ia32_fpclassss_mask:
   case X86::BI__builtin_ia32_fpclasssh_mask:
diff --git a/clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c b/clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c
new file mode 100644
index 0000000000000..b00859c174fba
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c
@@ -0,0 +1,1085 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-512 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386 -target-feature +avx10.2-512 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+__m512bh test_mm512_setzero_pbh() {
+  // CHECK-LABEL: @test_mm512_setzero_pbh
+  // CHECK: zeroinitializer
+  return _mm512_setzero_pbh();
+}
+
+__m512bh test_mm512_undefined_pbh(void) {
+  // CHECK-LABEL: @test_mm512_undefined_pbh
+  // CHECK: ret <32 x bfloat> zeroinitializer
+  return _mm512_undefined_pbh();
+}
+
+__m512bh test_mm512_set1_pbh(__bf16 h) {
+  // CHECK-LABEL: @test_mm512_set1_pbh
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 0
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 1
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 2
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 3
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 4
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 5
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 6
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 7
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 8
+  // CHECK:
insertelement <32 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 11 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 15 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 16 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 17 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 18 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 19 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 20 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 21 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 22 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 23 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 24 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 25 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 26 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 27 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 28 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 29 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 30 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 31 + return _mm512_set1_pbh(h); +} + +__m512bh test_mm512_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8, + __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, + __bf16 bf17, __bf16 bf18, __bf16 bf19, __bf16 bf20, + __bf16 bf21, __bf16 bf22, __bf16 bf23, __bf16 bf24, + __bf16 bf25, __bf16 bf26, __bf16 bf27, __bf16 bf28, + __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) { + // CHECK-LABEL: @test_mm512_set_pbh + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 7 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 8 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 11 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 15 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 16 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 17 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 18 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 19 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 20 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 21 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 22 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 23 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 24 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 25 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 26 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 27 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 28 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 29 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 30 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 31 + return _mm512_set_pbh(bf1, 
bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16, + bf17, bf18, bf19, bf20, bf21, bf22, bf23, bf24, + bf25, bf26, bf27, bf28, bf29, bf30, bf31, bf32); +} + +__m512bh test_mm512_setr_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8, + __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, + __bf16 bf17, __bf16 bf18, __bf16 bf19, __bf16 bf20, + __bf16 bf21, __bf16 bf22, __bf16 bf23, __bf16 bf24, + __bf16 bf25, __bf16 bf26, __bf16 bf27, __bf16 bf28, + __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) { + // CHECK-LABEL: @test_mm512_setr_pbh + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 7 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 8 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 11 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 15 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 16 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 17 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 18 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 19 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 20 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 21 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 22 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 23 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 24 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 25 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 26 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 27 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 28 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 29 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 30 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 31 + return _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16, + bf17, bf18, bf19, bf20, bf21, bf22, bf23, bf24, + bf25, bf26, bf27, bf28, bf29, bf30, bf31, bf32); +} + +__m512 test_mm512_castpbf16_ps(__m512bh A) { + // CHECK-LABEL: test_mm512_castpbf16_ps + // CHECK: bitcast <32 x bfloat> %{{.*}} to <16 x float> + return _mm512_castpbf16_ps(A); +} + +__m512d test_mm512_castpbf16_pd(__m512bh A) { + // CHECK-LABEL: test_mm512_castpbf16_pd + // CHECK: bitcast <32 x bfloat> %{{.*}} to <8 x double> + return _mm512_castpbf16_pd(A); +} + +__m512i test_mm512_castpbf16_si512(__m512bh A) { + // CHECK-LABEL: test_mm512_castpbf16_si512 + // CHECK: bitcast <32 x bfloat> %{{.*}} to <8 x i64> + return _mm512_castpbf16_si512(A); +} + +__m512bh test_mm512_castps_pbh(__m512 A) { + // CHECK-LABEL: test_mm512_castps_pbh + // CHECK: bitcast <16 x float> %{{.*}} to <32 x bfloat> + return _mm512_castps_pbh(A); +} + +__m512bh test_mm512_castpd_pbh(__m512d A) { + // CHECK-LABEL: test_mm512_castpd_pbh + // CHECK: bitcast <8 x double> %{{.*}} to <32 x bfloat> + return 
_mm512_castpd_pbh(A); +} + +__m512bh test_mm512_castsi512_pbh(__m512i A) { + // CHECK-LABEL: test_mm512_castsi512_pbh + // CHECK: bitcast <8 x i64> %{{.*}} to <32 x bfloat> + return _mm512_castsi512_pbh(A); +} + +__m128bh test_mm512_castpbf16512_pbh128(__m512bh __a) { + // CHECK-LABEL: test_mm512_castpbf16512_pbh128 + // CHECK: shufflevector <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <8 x i32> + return _mm512_castpbf16512_pbh128(__a); +} + +__m256bh test_mm512_castpbf16512_pbh256(__m512bh __a) { + // CHECK-LABEL: test_mm512_castpbf16512_pbh256 + // CHECK: shufflevector <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <16 x i32> + return _mm512_castpbf16512_pbh256(__a); +} + +__m512bh test_mm512_castpbf16128_pbh512(__m128bh __a) { + // CHECK-LABEL: test_mm512_castpbf16128_pbh512 + // CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <32 x i32> + return _mm512_castpbf16128_pbh512(__a); +} + +__m512bh test_mm512_castpbf16256_pbh512(__m256bh __a) { + // CHECK-LABEL: test_mm512_castpbf16256_pbh512 + // CHECK: shufflevector <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <32 x i32> + return _mm512_castpbf16256_pbh512(__a); +} + +__m512bh test_mm512_zextpbf16128_pbh512(__m128bh __a) { + // CHECK-LABEL: test_mm512_zextpbf16128_pbh512 + // CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> {{.*}}, <32 x i32> + return _mm512_zextpbf16128_pbh512(__a); +} + +__m512bh test_mm512_zextpbf16256_pbh512(__m256bh __a) { + // CHECK-LABEL: test_mm512_zextpbf16256_pbh512 + // CHECK: shufflevector <16 x bfloat> %{{.*}}, <16 x bfloat> {{.*}}, <32 x i32> + return _mm512_zextpbf16256_pbh512(__a); +} + +__m512bh test_mm512_abs_pbh(__m512bh a) { + // CHECK-LABEL: @test_mm512_abs_pbh + // CHECK: and <16 x i32> + return _mm512_abs_pbh(a); +} + +// VMOVSH + +__m512bh test_mm512_load_pbh(void *p) { + // CHECK-LABEL: @test_mm512_load_pbh + // CHECK: load <32 x bfloat>, ptr %{{.*}}, align 64 + return _mm512_load_pbh(p); +} + +__m512bh test_mm512_loadu_pbh(void *p) { + // CHECK-LABEL: @test_mm512_loadu_pbh + // CHECK: load <32 x bfloat>, ptr {{.*}}, align 1{{$}} + return _mm512_loadu_pbh(p); +} + +void test_mm512_store_pbh(void *p, __m512bh a) { + // CHECK-LABEL: @test_mm512_store_pbh + // CHECK: store <32 x bfloat> %{{.*}}, ptr %{{.*}}, align 64 + _mm512_store_pbh(p, a); +} + +void test_mm512_storeu_pbh(void *p, __m512bh a) { + // CHECK-LABEL: @test_mm512_storeu_pbh + // CHECK: store <32 x bfloat> %{{.*}}, ptr %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm512_storeu_pbh(p, a); +} + +__m512bh test_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) { + // CHECK-LABEL: @test_mm512_mask_blend_pbh + // CHECK: %{{.*}} = bitcast i32 %{{.*}} to <32 x i1> + // CHECK: %{{.*}} = select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_blend_pbh(__U, __A, __W); +} + +__m512bh test_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { + // CHECK-LABEL: @test_mm512_permutex2var_pbh + // CHECK: %{{.*}} = bitcast <32 x bfloat> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <32 x bfloat> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <32 x i16> %{{.*}} to <32 x bfloat> + return _mm512_permutex2var_pbh(__A, __I, __B); +} + +__m512bh test_mm512_permutexvar_epi16(__m512i __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_permutexvar_epi16 + // CHECK: %{{.*}} 
= bitcast <32 x bfloat> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <32 x i16> %{{.*}} to <32 x bfloat> + return _mm512_permutexvar_pbh(__A, __B); +} + +__m512bh test_mm512_addne_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_addne_pbh + // CHECK: %{{.*}} = fadd <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_addne_pbh(__A, __B); +} + +__m512bh test_mm512_mask_addne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fadd <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_addne_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_addne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fadd <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_addne_pbh(__U, __A, __B); +} + +__m512bh test_mm512_subne_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_subne_pbh + // CHECK: %{{.*}} = fsub <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_subne_pbh(__A, __B); +} + +__m512bh test_mm512_mask_subne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fsub <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_subne_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_subne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fsub <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_subne_pbh(__U, __A, __B); +} + +__m512bh test_mm512_mulne_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_mulne_pbh + // CHECK: %{{.*}} = fmul <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_mulne_pbh(__A, __B); +} + +__m512bh test_mm512_mask_mulne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fmul <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_mulne_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_mulne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fmul <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_mulne_pbh(__U, __A, __B); +} + +__m512bh test_mm512_divne_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_divne_pbh + // CHECK: %{{.*}} = fdiv <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_divne_pbh(__A, __B); +} + +__m512bh test_mm512_mask_divne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fdiv <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_divne_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_divne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fdiv <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_divne_pbh(__U, __A, __B); +} + +__m512bh test_mm512_max_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_max_pbh + // CHECK: @llvm.x86.avx10.vmaxpbf16512( + 
return _mm512_max_pbh(__A, __B); +} + +__m512bh test_mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16512 + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_max_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16512 + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_max_pbh(__U, __A, __B); +} + +__m512bh test_mm512_min_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_min_pbh + // CHECK: @llvm.x86.avx10.vminpbf16512( + return _mm512_min_pbh(__A, __B); +} + +__m512bh test_mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: @llvm.x86.avx10.vminpbf16512 + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_min_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: @llvm.x86.avx10.vminpbf16512 + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_min_pbh(__U, __A, __B); +} + +__mmask32 test_mm512_cmp_pbh_mask_eq_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: @test_mm512_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_EQ_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_lt_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_LT_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_le_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_le_os + // CHECK: fcmp ole <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_LE_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_unord_q(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_UNORD_Q); +} + +__mmask32 test_mm512_cmp_pbh_mask_neq_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_nlt_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NLT_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_nle_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NLE_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_ord_q(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_ORD_Q); +} + +__mmask32 test_mm512_cmp_pbh_mask_eq_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_EQ_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_nge_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NGE_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_ngt_us(__m512bh a, __m512bh 
b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NGT_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_false_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_false_oq + // CHECK: fcmp false <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_neq_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_ge_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_GE_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_gt_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_GT_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_true_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_true_uq + // CHECK: fcmp true <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_eq_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_EQ_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_lt_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_LT_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_le_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_LE_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_unord_s(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_UNORD_S); +} + +__mmask32 test_mm512_cmp_pbh_mask_neq_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_neq_us + // CHECK: fcmp une <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NEQ_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_nlt_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NLT_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_nle_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NLE_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_ord_s(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_ORD_S); +} + +__mmask32 test_mm512_cmp_pbh_mask_eq_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_EQ_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_nge_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NGE_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_ngt_uq(__m512bh a, 
__m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NGT_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_false_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_false_os + // CHECK: fcmp false <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_FALSE_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_neq_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_neq_os + // CHECK: fcmp one <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NEQ_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_ge_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_GE_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_gt_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_GT_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_true_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_true_us + // CHECK: fcmp true <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_TRUE_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_eq_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: @test_mm512_mask_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_lt_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_le_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_le_os + // CHECK: fcmp ole <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_unord_q(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_neq_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nlt_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nle_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ord_q(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} 
+ return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_eq_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nge_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ngt_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_false_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_false_oq + // CHECK: fcmp false <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_neq_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ge_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_gt_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_true_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_true_uq + // CHECK: fcmp true <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_eq_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_lt_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_le_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_unord_s(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, 
%{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_neq_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_neq_us + // CHECK: fcmp une <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nlt_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nle_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ord_s(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_S); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_eq_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nge_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ngt_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_false_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_false_os + // CHECK: fcmp false <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_neq_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_neq_os + // CHECK: fcmp one <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ge_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_gt_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_true_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_true_us + // CHECK: fcmp true <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> 
%{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_US); +} + +__mmask32 test_mm512_mask_fpclass_pbh_mask(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.512 + return _mm512_mask_fpclass_pbh_mask(__U, __A, 4); +} + +__mmask32 test_mm512_fpclass_pbh_mask(__m512bh __A) { + // CHECK-LABEL: @test_mm512_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.512 + return _mm512_fpclass_pbh_mask(__A, 4); +} + +__m512bh test_mm512_scalef_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.512 + return _mm512_scalef_pbh(__A, __B); +} + +__m512bh test_mm512_mask_scalef_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_mask_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.512 + return _mm512_mask_scalef_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_maskz_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.512 + return _mm512_maskz_scalef_pbh(__U, __A, __B); +} + +__m512bh test_mm512_rcp_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.512 + return _mm512_rcp_pbh(__A); +} + +__m512bh test_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.512 + return (__m512bh)_mm512_mask_rcp_pbh(__W, __U, __A); +} + +__m512bh test_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.512 + return _mm512_maskz_rcp_pbh(__U, __A); +} + +__m512bh test_mm512_getexp_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.512 + return _mm512_getexp_pbh(__A); +} + +__m512bh test_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.512 + return _mm512_mask_getexp_pbh(__W, __U, __A); +} + +__m512bh test_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.512 + return _mm512_maskz_getexp_pbh(__U, __A); +} + +__m512bh test_mm512_rsqrt_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.512 + return _mm512_rsqrt_pbh(__A); +} + +__m512bh test_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.512 + return (__m512bh)_mm512_mask_rsqrt_pbh(__W, __U, __A); +} + +__m512bh test_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.512 + return _mm512_maskz_rsqrt_pbh(__U, __A); +} + +__m512bh test_mm512_reducene_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.512 + return _mm512_reducene_pbh(__A, 3); +} + +__m512bh test_mm512_mask_reducene_pbh(__m512bh __W, __mmask16 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.512 + return _mm512_mask_reducene_pbh(__W, __U, __A, 1); +} + +__m512bh test_mm512_maskz_reducene_pbh(__mmask16 __U, __m512bh __A) { 
+ // CHECK-LABEL: @test_mm512_maskz_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.512 + return _mm512_maskz_reducene_pbh(__U, __A, 1); +} + +__m512bh test_mm512_roundscalene_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.512 + return _mm512_roundscalene_pbh(__A, 3); +} + +__m512bh test_mm512_mask_roundscalene_pbh(__m512bh __W, __mmask16 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.512 + return _mm512_mask_roundscalene_pbh(__W, __U, __A, 1); +} + +__m512bh test_mm512_maskz_roundscalene_pbh(__mmask16 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.512 + return _mm512_maskz_roundscalene_pbh(__U, __A, 1 ); +} + +__m512bh test_mm512_getmant_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.512 + return _mm512_getmant_pbh(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m512bh test_mm512_mask_getmant_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.512 + return _mm512_mask_getmant_pbh(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m512bh test_mm512_maskz_getmant_pbh(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.512 + return _mm512_maskz_getmant_pbh(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m512bh test_mm512_sqrt_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_sqrt_pbh + // CHECK: %{{.*}} = call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %{{.*}}) + return _mm512_sqrt_pbh(__A); +} + +__m512bh test_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_sqrt_pbh + // CHECK: %{{.*}} = call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return (__m512bh)_mm512_mask_sqrt_pbh(__W, __U, __A); +} + +__m512bh test_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_sqrt_pbh + // CHECK: %{{.*}} = call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_sqrt_pbh(__U, __A); +} + +__m512bh test_mm512_fmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_fmaddne_pbh + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + return _mm512_fmaddne_pbh(__A, __B, __C); +} + +__m512bh test_mm512_mask_fmaddne_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_mask_fmaddne_pbh + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_fmaddne_pbh(__A, __U, __B, __C); +} + +__m512bh test_mm512_mask3_fmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmaddne_pbh + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select 
<32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask3_fmaddne_pbh(__A, __B, __C, __U); +} + +__m512bh test_mm512_maskz_fmaddne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_maskz_fmaddne_pbh + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_fmaddne_pbh(__U, __A, __B, __C); +} + +__m512bh test_mm512_fmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_fmsubne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + return _mm512_fmsubne_pbh(__A, __B, __C); +} + +__m512bh test_mm512_mask_fmsubne_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_mask_fmsubne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_fmsubne_pbh(__A, __U, __B, __C); +} + +__m512bh test_mm512_mask3_fmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmsubne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask3_fmsubne_pbh(__A, __B, __C, __U); +} + +__m512bh test_mm512_maskz_fmsubne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_maskz_fmsubne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_fmsubne_pbh(__U, __A, __B, __C); +} + +__m512bh test_mm512_fnmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + return _mm512_fnmaddne_pbh(__A, __B, __C); +} + +__m512bh test_mm512_mask_fnmaddne_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_mask_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_fnmaddne_pbh(__A, __U, __B, __C); +} + +__m512bh test_mm512_mask3_fnmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask3_fnmaddne_pbh(__A, __B, __C, __U); +} + +__m512bh test_mm512_maskz_fnmaddne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_maskz_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: 
select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}
+  return _mm512_maskz_fnmaddne_pbh(__U, __A, __B, __C);
+}
+
+__m512bh test_mm512_fnmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
+  // CHECK-LABEL: @test_mm512_fnmsubne_pbh
+  // CHECK: fneg
+  // CHECK: fneg
+  // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}})
+  return _mm512_fnmsubne_pbh(__A, __B, __C);
+}
+
+__m512bh test_mm512_mask_fnmsubne_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
+  // CHECK-LABEL: @test_mm512_mask_fnmsubne_pbh
+  // CHECK: fneg
+  // CHECK: fneg
+  // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}})
+  // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}
+  return _mm512_mask_fnmsubne_pbh(__A, __U, __B, __C);
+}
+
+__m512bh test_mm512_mask3_fnmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
+  // CHECK-LABEL: @test_mm512_mask3_fnmsubne_pbh
+  // CHECK: fneg
+  // CHECK: fneg
+  // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}})
+  // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}
+  return _mm512_mask3_fnmsubne_pbh(__A, __B, __C, __U);
+}
+
+__m512bh test_mm512_maskz_fnmsubne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
+  // CHECK-LABEL: @test_mm512_maskz_fnmsubne_pbh
+  // CHECK: fneg
+  // CHECK: fneg
+  // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}})
+  // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}
+  return _mm512_maskz_fnmsubne_pbh(__U, __A, __B, __C);
+}
diff --git a/clang/test/CodeGen/X86/avx10_2bf16-builtins.c b/clang/test/CodeGen/X86/avx10_2bf16-builtins.c
new file mode 100644
index 0000000000000..cd94edcf58ea2
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2bf16-builtins.c
@@ -0,0 +1,2082 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-256 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386 -target-feature +avx10.2-256 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+__m256bh test_mm256_setzero_pbh() {
+  // CHECK-LABEL: @test_mm256_setzero_pbh
+  // CHECK: zeroinitializer
+  return _mm256_setzero_pbh();
+}
+
+__m128bh test_mm_setzero_pbh() {
+  // CHECK-LABEL: @test_mm_setzero_pbh
+  // CHECK: zeroinitializer
+  return _mm_setzero_pbh();
+}
+
+__m256bh test_mm256_undefined_pbh(void) {
+  // CHECK-LABEL: @test_mm256_undefined_pbh
+  // CHECK: ret <16 x bfloat> zeroinitializer
+  return _mm256_undefined_pbh();
+}
+
+__m128bh test_mm_undefined_pbh(void) {
+  // CHECK-LABEL: @test_mm_undefined_pbh
+  // CHECK: ret <8 x bfloat> zeroinitializer
+  return _mm_undefined_pbh();
+}
+
+__bf16 test_mm_cvtsbh_bf16(__m128bh __A) {
+  // CHECK-LABEL: @test_mm_cvtsbh_bf16
+  // CHECK: extractelement <8 x bfloat> %{{.*}}, i32 0
+  return _mm_cvtsbh_bf16(__A);
+}
+
+__bf16 test_mm256_cvtsbh_bf16(__m256bh __A) {
+  // CHECK-LABEL: @test_mm256_cvtsbh_bf16
+  // CHECK: extractelement <16 x bfloat> %{{.*}}, i32 0
+  return _mm256_cvtsbh_bf16(__A);
+}
+
+__m128bh test_mm_set_sbh(__bf16 h) {
+  // CHECK-LABEL: @test_mm_set_sbh
+  // CHECK: insertelement <8 x bfloat> {{.*}}, i32 0
+  // CHECK: insertelement <8 x bfloat>
%{{.*}}, bfloat %{{.*}}, i32 1 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 2 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 3 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 4 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 5 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 6 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 7 + return _mm_set_sbh(h); +} + +__m128bh test_mm_set1_pbh(__bf16 h) { + // CHECK-LABEL: @test_mm_set1_pbh + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 7 + return _mm_set1_pbh(h); +} + +__m256bh test_mm256_set1_pbh(__bf16 h) { + // CHECK-LABEL: @test_mm256_set1_pbh + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 7 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 8 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 11 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 15 + return _mm256_set1_pbh(h); +} + +__m128bh test_mm_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8) { + // CHECK-LABEL: @test_mm_set_pbh + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 7 + return _mm_set_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8); +} + +__m256bh test_mm256_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8, + __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16) { + // CHECK-LABEL: @test_mm256_set_pbh + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 7 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 8 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <16 x 
bfloat> {{.*}}, i32 11 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 15 + return _mm256_set_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16); +} + +__m128bh test_mm_setr_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8) { + // CHECK-LABEL: @test_mm_setr_pbh + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 7 + return _mm_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8); +} + +__m256bh test_mm256_setr_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8, + __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16) { + // CHECK-LABEL: @test_mm256_setr_pbh + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 7 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 8 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 11 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 15 + return _mm256_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16); +} + +__m128 test_mm_castpbf16_ps(__m128bh A) { + // CHECK-LABEL: test_mm_castpbf16_ps + // CHECK: bitcast <8 x bfloat> %{{.*}} to <4 x float> + return _mm_castpbf16_ps(A); +} + +__m256 test_mm256_castpbf16_ps(__m256bh A) { + // CHECK-LABEL: test_mm256_castpbf16_ps + // CHECK: bitcast <16 x bfloat> %{{.*}} to <8 x float> + return _mm256_castpbf16_ps(A); +} + +__m128i test_mm_castpbf16_si128(__m128bh A) { + // CHECK-LABEL: test_mm_castpbf16_si128 + // CHECK: bitcast <8 x bfloat> %{{.*}} to <2 x i64> + return _mm_castpbf16_si128(A); +} + +__m256i test_mm256_castpbf16_si256(__m256bh A) { + // CHECK-LABEL: test_mm256_castpbf16_si256 + // CHECK: bitcast <16 x bfloat> %{{.*}} to <4 x i64> + return _mm256_castpbf16_si256(A); +} + +__m128bh test_mm_castps_pbh(__m128 A) { + // CHECK-LABEL: test_mm_castps_pbh + // CHECK: bitcast <4 x float> %{{.*}} to <8 x bfloat> + return _mm_castps_pbh(A); +} + +__m256bh test_mm256_castps_pbh(__m256 A) { + // CHECK-LABEL: test_mm256_castps_pbh + // CHECK: bitcast <8 x float> %{{.*}} to <16 x bfloat> + return _mm256_castps_pbh(A); +} + +__m128bh test_mm_castpd_pbh(__m128d A) { + // CHECK-LABEL: test_mm_castpd_pbh + // CHECK: bitcast <2 x double> %{{.*}} to <8 x bfloat> + return _mm_castpd_pbh(A); +} + +__m256bh 
test_mm256_castpd_pbh(__m256d A) { + // CHECK-LABEL: test_mm256_castpd_pbh + // CHECK: bitcast <4 x double> %{{.*}} to <16 x bfloat> + return _mm256_castpd_pbh(A); +} + +__m128bh test_mm_castsi128_pbh(__m128i A) { + // CHECK-LABEL: test_mm_castsi128_pbh + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x bfloat> + return _mm_castsi128_pbh(A); +} + +__m256bh test_mm256_castsi256_pbh(__m256i A) { + // CHECK-LABEL: test_mm256_castsi256_pbh + // CHECK: bitcast <4 x i64> %{{.*}} to <16 x bfloat> + return _mm256_castsi256_pbh(A); +} + +__m128d test_mm_castpbf16_pd(__m128bh A) { + // CHECK-LABEL: test_mm_castpbf16_pd + // CHECK: bitcast <8 x bfloat> %{{.*}} to <2 x double> + return _mm_castpbf16_pd(A); +} + +__m128bh test_mm256_castpbf16256_pbh128(__m256bh __a) { + // CHECK-LABEL: test_mm256_castpbf16256_pbh128 + // CHECK: shufflevector <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <8 x i32> + return _mm256_castpbf16256_pbh128(__a); +} + +__m256bh test_mm256_castpbf16128_pbh256(__m128bh __a) { + // CHECK-LABEL: test_mm256_castpbf16128_pbh256 + // CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <16 x i32> + return _mm256_castpbf16128_pbh256(__a); +} + +__m256d test_mm256_castpbf16_pd(__m256bh A) { + // CHECK-LABEL: test_mm256_castpbf16_pd + // CHECK: bitcast <16 x bfloat> %{{.*}} to <4 x double> + return _mm256_castpbf16_pd(A); +} + +__m256bh test_mm256_zextpbf16128_pbh256(__m128bh __a) { + // CHECK-LABEL: test_mm256_zextpbf16128_pbh256 + // CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> {{.*}}, <16 x i32> + return _mm256_zextpbf16128_pbh256(__a); +} + +__m128bh test_mm_abs_pbh(__m128bh a) { + // CHECK-LABEL: @test_mm_abs_pbh + // CHECK: and <4 x i32> + return _mm_abs_pbh(a); +} + +__m256bh test_mm256_abs_pbh(__m256bh a) { + // CHECK-LABEL: @test_mm256_abs_pbh + // CHECK: and <8 x i32> + return _mm256_abs_pbh(a); +} + +__m256bh test_mm256_loadu_pbh(void *p) { + // CHECK-LABEL: @test_mm256_loadu_pbh + // CHECK: load <16 x bfloat>, ptr {{.*}}, align 1{{$}} + return _mm256_loadu_pbh(p); +} + +__m128bh test_mm_load_sbh(void const *A) { + // CHECK-LABEL: test_mm_load_sbh + // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> bitcast (<1 x i8> to <8 x i1>), <8 x bfloat> %{{.*}}) + return _mm_load_sbh(A); +} + +__m256bh test_mm256_load_pbh(void *p) { + // CHECK-LABEL: @test_mm256_load_pbh + // CHECK: load <16 x bfloat>, ptr %{{.*}}, align 32 + return _mm256_load_pbh(p); +} + +__m128bh test_mm_load_pbh(void *p) { + // CHECK-LABEL: @test_mm_load_pbh + // CHECK: load <8 x bfloat>, ptr %{{.*}}, align 16 + return _mm_load_pbh(p); +} + +__m128bh test_mm_loadu_pbh(void *p) { + // CHECK-LABEL: @test_mm_loadu_pbh + // CHECK: load <8 x bfloat>, ptr {{.*}}, align 1{{$}} + return _mm_loadu_pbh(p); +} + +void test_mm_store_sbh(void *A, __m128bh B) { + // CHECK-LABEL: test_mm_store_sbh + // CHECK: extractelement <8 x bfloat> %{{.*}}, i32 0 + // CHECK: store bfloat %{{.*}}, ptr %{{.*}}, align 1{{$}} + _mm_store_sbh(A, B); +} + +void test_mm_mask_store_sbh(void *__P, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_store_sbh + // CHECK: call void @llvm.masked.store.v8bf16.p0(<8 x bfloat> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}) + _mm_mask_store_sbh(__P, __U, __A); +} + +void test_mm256_store_pbh(void *p, __m256bh a) { + // CHECK-LABEL: @test_mm256_store_pbh + // CHECK: store <16 x bfloat> %{{.*}}, ptr %{{.*}}, align 32 + _mm256_store_pbh(p, a); +} + +void test_mm_store_pbh(void *p, __m128bh a) { + // CHECK-LABEL: @test_mm_store_pbh + // 
CHECK: store <8 x bfloat> %{{.*}}, ptr %{{.*}}, align 16 + _mm_store_pbh(p, a); +} + +__m128bh test_mm_mask_load_sbh(__m128bh __A, __mmask8 __U, const void *__W) { + // CHECK-LABEL: @test_mm_mask_load_sbh + // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_mask_load_sbh(__A, __U, __W); +} + +__m128bh test_mm_maskz_load_sbh(__mmask8 __U, const void *__W) { + // CHECK-LABEL: @test_mm_maskz_load_sbh + // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_maskz_load_sbh(__U, __W); +} + +void test_mm256_storeu_pbh(void *p, __m256bh a) { + // CHECK-LABEL: @test_mm256_storeu_pbh + // CHECK: store <16 x bfloat> %{{.*}}, ptr %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm256_storeu_pbh(p, a); +} + +void test_mm_storeu_pbh(void *p, __m128bh a) { + // CHECK-LABEL: @test_mm_storeu_pbh + // CHECK: store <8 x bfloat> %{{.*}}, ptr %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm_storeu_pbh(p, a); +} + +__m128bh test_mm_move_sbh(__m128bh A, __m128bh B) { + // CHECK-LABEL: test_mm_move_sbh + // CHECK: extractelement <8 x bfloat> %{{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 0 + return _mm_move_sbh(A, B); +} + +__m128bh test_mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_mask_move_sbh + // CHECK: [[EXT:%.*]] = extractelement <8 x bfloat> %{{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat [[EXT]], i32 0 + // CHECK: [[A:%.*]] = extractelement <8 x bfloat> [[VEC:%.*]], i64 0 + // CHECK-NEXT: [[B:%.*]] = extractelement <8 x bfloat> %{{.*}}, i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, bfloat [[A]], bfloat [[B]] + // CHECK-NEXT: insertelement <8 x bfloat> [[VEC]], bfloat [[SEL]], i64 0 + return _mm_mask_move_sbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_maskz_move_sbh + // CHECK: [[EXT:%.*]] = extractelement <8 x bfloat> %{{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat [[EXT]], i32 0 + // CHECK: [[A:%.*]] = extractelement <8 x bfloat> [[VEC:%.*]], i64 0 + // CHECK-NEXT: [[B:%.*]] = extractelement <8 x bfloat> %{{.*}}, i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, bfloat [[A]], bfloat [[B]] + // CHECK-NEXT: insertelement <8 x bfloat> [[VEC]], bfloat [[SEL]], i64 0 + return _mm_maskz_move_sbh(__U, __A, __B); +} + +__m128bh test_mm_mask_blend_pbh(__mmask8 __U, __m128bh __A, __m128bh __W) { + // CHECK-LABEL: @test_mm_mask_blend_pbh + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask_blend_pbh(__U, __A, __W); +} + +__m256bh test_mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) { + // CHECK-LABEL: @test_mm256_mask_blend_pbh + // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask_blend_pbh(__U, __A, __W); +} + +__m128bh test_mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) { + // CHECK-LABEL: @test_mm_permutex2var_pbh + // CHECK: %{{.*}} = bitcast <8 x 
bfloat> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <8 x bfloat> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <8 x i16> %{{.*}} to <8 x bfloat> + return _mm_permutex2var_pbh(__A, __I, __B); +} + +__m256bh test_mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { + // CHECK-LABEL: @test_mm256_permutex2var_pbh + // CHECK: %{{.*}} = bitcast <16 x bfloat> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <16 x bfloat> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <16 x i16> %{{.*}} to <16 x bfloat> + return _mm256_permutex2var_pbh(__A, __I, __B); +} + +__m128bh test_mm_permutexvar_pbh(__m128i __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_permutexvar_pbh + // CHECK: %{{.*}} = bitcast <8 x bfloat> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <8 x i16> %{{.*}} to <8 x bfloat> + return _mm_permutexvar_pbh(__A, __B); +} + +__m256bh test_mm256_permutexvar_pbh(__m256i __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_permutexvar_pbh + // CHECK: %{{.*}} = bitcast <16 x bfloat> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <16 x i16> %{{.*}} to <16 x bfloat> + return _mm256_permutexvar_pbh(__A, __B); +} + +__m256bh test_mm256_addne_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_addne_pbh + // CHECK: %{{.*}} = fadd <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_addne_pbh(__A, __B); +} + +__m256bh test_mm256_mask_addne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fadd <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_addne_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_addne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fadd <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_addne_pbh(__U, __A, __B); +} + +__m128bh test_mm_addne_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_addne_pbh + // CHECK: %{{.*}} = fadd <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_addne_pbh(__A, __B); +} + +__m128bh test_mm_mask_addne_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fadd <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_addne_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_addne_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fadd <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_addne_pbh(__U, __A, __B); +} + +__m256bh test_mm256_subne_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_subne_pbh + // CHECK: %{{.*}} = fsub <16 x 
bfloat> %{{.*}}, %{{.*}} + return _mm256_subne_pbh(__A, __B); +} + +__m256bh test_mm256_mask_subne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fsub <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_subne_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_subne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fsub <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_subne_pbh(__U, __A, __B); +} + +__m128bh test_mm_subne_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_subne_pbh + // CHECK: %{{.*}} = fsub <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_subne_pbh(__A, __B); +} + +__m128bh test_mm_mask_subne_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fsub <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_subne_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_subne_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fsub <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_subne_pbh(__U, __A, __B); +} + +__m256bh test_mm256_mulne_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_mulne_pbh + // CHECK: %{{.*}} = fmul <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_mulne_pbh(__A, __B); +} + +__m256bh test_mm256_mask_mulne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fmul <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_mulne_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_mulne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fmul <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_mulne_pbh(__U, __A, __B); +} + +__m128bh test_mm_mulne_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_mulne_pbh + // CHECK: %{{.*}} = fmul <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_mulne_pbh(__A, __B); +} + +__m128bh test_mm_mask_mulne_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fmul <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_mulne_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_mulne_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fmul <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_mulne_pbh(__U, __A, __B); +} + +__m256bh test_mm256_divne_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_divne_pbh + // CHECK: %{{.*}} = fdiv <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_divne_pbh(__A, __B); +} + +__m256bh test_mm256_mask_divne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fdiv <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_divne_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_divne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fdiv <16 x bfloat> 
%{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_divne_pbh(__U, __A, __B); +} + +__m128bh test_mm_divne_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_divne_pbh + // CHECK: %{{.*}} = fdiv <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_divne_pbh(__A, __B); +} + +__m128bh test_mm_mask_divne_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fdiv <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_divne_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_divne_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fdiv <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_divne_pbh(__U, __A, __B); +} + +__m256bh test_mm256_max_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_max_pbh + // CHECK: @llvm.x86.avx10.vmaxpbf16256( + return _mm256_max_pbh(__A, __B); +} + +__m256bh test_mm256_mask_max_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16256 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_max_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_max_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16256 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_max_pbh(__U, __A, __B); +} + +__m128bh test_mm_max_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_max_pbh + // CHECK: @llvm.x86.avx10.vmaxpbf16128( + return _mm_max_pbh(__A, __B); +} + +__m128bh test_mm_mask_max_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16128 + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_max_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_max_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16128 + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_max_pbh(__U, __A, __B); +} + +__m256bh test_mm256_min_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_min_pbh + // CHECK: @llvm.x86.avx10.vminpbf16256( + return _mm256_min_pbh(__A, __B); +} + +__m256bh test_mm256_mask_min_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: @llvm.x86.avx10.vminpbf16256 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_min_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_min_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: @llvm.x86.avx10.vminpbf16256 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_min_pbh(__U, __A, __B); +} + +__m128bh test_mm_min_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_min_pbh + // CHECK: @llvm.x86.avx10.vminpbf16128( + return _mm_min_pbh(__A, __B); +} + +__m128bh test_mm_mask_min_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: @llvm.x86.avx10.vminpbf16128 + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_min_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_min_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // 
CHECK: @llvm.x86.avx10.vminpbf16128 + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_min_pbh(__U, __A, __B); +} + +int test_mm_comeqsbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comeqsbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16eq(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comeqsbh(__A, __B); +} + +int test_mm_comltsbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comltsbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16lt(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comltsbh(__A, __B); +} + +int test_mm_comlesbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comlesbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16le(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comlesbh(__A, __B); +} + +int test_mm_comgtsbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comgtsbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16gt(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comgtsbh(__A, __B); +} + +int test_mm_comgesbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comgesbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16ge(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comgesbh(__A, __B); +} + +int test_mm_comneqsbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comneqsbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16neq(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comneqsbh(__A, __B); +} + +__mmask16 test_mm256_cmp_pbh_mask_eq_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: @test_mm256_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_EQ_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_lt_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_LT_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_le_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_le_os + // CHECK: fcmp ole <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_LE_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_unord_q(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_UNORD_Q); +} + +__mmask16 test_mm256_cmp_pbh_mask_neq_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_nlt_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NLT_US); +} + +__mmask16 test_mm256_cmp_pbh_mask_nle_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NLE_US); +} + +__mmask16 test_mm256_cmp_pbh_mask_ord_q(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_ORD_Q); +} + +__mmask16 test_mm256_cmp_pbh_mask_eq_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_EQ_UQ); +} + +__mmask16 
test_mm256_cmp_pbh_mask_nge_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NGE_US); +} + +__mmask16 test_mm256_cmp_pbh_mask_ngt_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NGT_US); +} + +__mmask16 test_mm256_cmp_pbh_mask_false_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_false_oq + // CHECK: fcmp false <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_neq_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_ge_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_GE_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_gt_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_GT_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_true_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_true_uq + // CHECK: fcmp true <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_eq_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_EQ_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_lt_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_LT_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_le_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_LE_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_unord_s(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_UNORD_S); +} + +__mmask16 test_mm256_cmp_pbh_mask_neq_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_neq_us + // CHECK: fcmp une <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NEQ_US); +} + +__mmask16 test_mm256_cmp_pbh_mask_nlt_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NLT_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_nle_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NLE_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_ord_s(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_ORD_S); +} + +__mmask16 test_mm256_cmp_pbh_mask_eq_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_EQ_US); +} + 
+__mmask16 test_mm256_cmp_pbh_mask_nge_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NGE_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_ngt_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NGT_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_false_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_false_os + // CHECK: fcmp false <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_FALSE_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_neq_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_neq_os + // CHECK: fcmp one <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NEQ_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_ge_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_GE_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_gt_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_GT_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_true_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_true_us + // CHECK: fcmp true <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_TRUE_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_eq_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: @test_mm256_mask_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_lt_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_le_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_le_os + // CHECK: fcmp ole <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_unord_q(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_neq_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nlt_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nle_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and 
<16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ord_q(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_eq_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nge_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ngt_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_false_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_false_oq + // CHECK: fcmp false <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_neq_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ge_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_gt_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_true_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_true_uq + // CHECK: fcmp true <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_eq_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_lt_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_le_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: 
and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_unord_s(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_neq_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_neq_us + // CHECK: fcmp une <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nlt_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nle_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ord_s(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_S); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_eq_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nge_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ngt_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_false_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_false_os + // CHECK: fcmp false <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_neq_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_neq_os + // CHECK: fcmp one <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ge_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_gt_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <16 x bfloat> %{{.*}}, %{{.*}} + 
// CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_true_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_true_us + // CHECK: fcmp true <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_US); +} + +__mmask8 test_mm_cmp_pbh_mask_eq_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: @test_mm_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_EQ_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_lt_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_LT_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_le_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_le_os + // CHECK: fcmp ole <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_LE_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_unord_q(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_UNORD_Q); +} + +__mmask8 test_mm_cmp_pbh_mask_neq_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_nlt_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NLT_US); +} + +__mmask8 test_mm_cmp_pbh_mask_nle_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NLE_US); +} + +__mmask8 test_mm_cmp_pbh_mask_ord_q(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_ORD_Q); +} + +__mmask8 test_mm_cmp_pbh_mask_eq_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_EQ_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_nge_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NGE_US); +} + +__mmask8 test_mm_cmp_pbh_mask_ngt_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NGT_US); +} + +__mmask8 test_mm_cmp_pbh_mask_false_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_false_oq + // CHECK: fcmp false <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_neq_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_ge_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_GE_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_gt_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, 
b, _CMP_GT_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_true_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_true_uq + // CHECK: fcmp true <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_eq_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_EQ_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_lt_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_LT_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_le_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_LE_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_unord_s(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_UNORD_S); +} + +__mmask8 test_mm_cmp_pbh_mask_neq_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_neq_us + // CHECK: fcmp une <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NEQ_US); +} + +__mmask8 test_mm_cmp_pbh_mask_nlt_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NLT_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_nle_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NLE_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_ord_s(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_ORD_S); +} + +__mmask8 test_mm_cmp_pbh_mask_eq_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_EQ_US); +} + +__mmask8 test_mm_cmp_pbh_mask_nge_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NGE_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_ngt_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NGT_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_false_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_false_os + // CHECK: fcmp false <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_FALSE_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_neq_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_neq_os + // CHECK: fcmp one <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NEQ_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_ge_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_GE_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_gt_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_GT_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_true_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_true_us + // CHECK: fcmp true <8 x bfloat> 
%{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_TRUE_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_eq_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: @test_mm_mask_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_lt_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_le_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_le_os + // CHECK: fcmp ole <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_unord_q(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_neq_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nlt_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nle_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ord_q(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_eq_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nge_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ngt_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_false_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_false_oq + // CHECK: fcmp false <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_neq_oq(__mmask8 m, __m128bh a, __m128bh b) { 
+ // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ge_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_gt_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_true_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_true_uq + // CHECK: fcmp true <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_eq_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_lt_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_le_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_unord_s(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_neq_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_neq_us + // CHECK: fcmp une <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nlt_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nle_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ord_s(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_S); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_eq_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return 
_mm_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nge_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ngt_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_false_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_false_os + // CHECK: fcmp false <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_neq_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_neq_os + // CHECK: fcmp one <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ge_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_gt_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_true_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_true_us + // CHECK: fcmp true <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_US); +} + + +__mmask16 test_mm256_mask_fpclass_pbh_mask(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.256 + return _mm256_mask_fpclass_pbh_mask(__U, __A, 4); +} + +__mmask16 test_mm256_fpclass_pbh_mask(__m256bh __A) { + // CHECK-LABEL: @test_mm256_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.256 + return _mm256_fpclass_pbh_mask(__A, 4); +} + +__mmask8 test_mm_mask_fpclass_pbh_mask(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.128 + return _mm_mask_fpclass_pbh_mask(__U, __A, 4); +} + +__mmask8 test_mm_fpclass_pbh_mask(__m128bh __A) { + // CHECK-LABEL: @test_mm_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.128 + return _mm_fpclass_pbh_mask(__A, 4); +} + +__m256bh test_mm256_scalef_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.256 + return _mm256_scalef_pbh(__A, __B); +} + +__m256bh test_mm256_mask_scalef_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_mask_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.256 + return _mm256_mask_scalef_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_scalef_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_maskz_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.256 + return _mm256_maskz_scalef_pbh(__U, 
__A, __B); +} + +__m256bh test_mm256_rcp_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.256 + return _mm256_rcp_pbh(__A); +} + +__m256bh test_mm256_mask_rcp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.256 + return (__m256bh)_mm256_mask_rcp_pbh(__W, __U, __A); +} + +__m256bh test_mm256_maskz_rcp_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.256 + return _mm256_maskz_rcp_pbh(__U, __A); +} + +__m256bh test_mm256_getexp_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.256 + return _mm256_getexp_pbh(__A); +} + +__m256bh test_mm256_mask_getexp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.256 + return _mm256_mask_getexp_pbh(__W, __U, __A); +} + +__m256bh test_mm256_maskz_getexp_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.256 + return _mm256_maskz_getexp_pbh(__U, __A); +} + +__m256bh test_mm256_rsqrt_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.256 + return _mm256_rsqrt_pbh(__A); +} + +__m256bh test_mm256_mask_rsqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.256 + return (__m256bh)_mm256_mask_rsqrt_pbh(__W, __U, __A); +} + +__m256bh test_mm256_maskz_rsqrt_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.256 + return _mm256_maskz_rsqrt_pbh(__U, __A); +} + +__m256bh test_mm256_reducene_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.256 + return _mm256_reducene_pbh(__A, 3); +} + +__m256bh test_mm256_mask_reducene_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.256 + return _mm256_mask_reducene_pbh(__W, __U, __A, 1); +} + +__m256bh test_mm256_maskz_reducene_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.256 + return _mm256_maskz_reducene_pbh(__U, __A, 1); +} + +__m256bh test_mm256_roundscalene_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.256 + return _mm256_roundscalene_pbh(__A, 3); +} + +__m256bh test_mm256_mask_roundscalene_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.256 + return _mm256_mask_roundscalene_pbh(__W, __U, __A, 1); +} + +__m256bh test_mm256_maskz_roundscalene_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.256 + return _mm256_maskz_roundscalene_pbh(__U, __A, 1 ); +} + +__m256bh test_mm256_getmant_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.256 + return _mm256_getmant_pbh(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m256bh test_mm256_mask_getmant_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: 
@test_mm256_mask_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.256 + return _mm256_mask_getmant_pbh(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m256bh test_mm256_maskz_getmant_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.256 + return _mm256_maskz_getmant_pbh(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m256bh test_mm256_sqrt_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_sqrt_pbh + // CHECK: call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> %{{.*}}) + return _mm256_sqrt_pbh(__A); +} + +__m256bh test_mm256_mask_sqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_sqrt_pbh + // CHECK: @llvm.sqrt.v16bf16 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_sqrt_pbh(__W, __U, __A); +} + +__m256bh test_mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_sqrt_pbh + // CHECK: @llvm.sqrt.v16bf16 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_sqrt_pbh(__U, __A); +} + +__m128bh test_mm_scalef_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.128 + return _mm_scalef_pbh(__A, __B); +} + +__m128bh test_mm_mask_scalef_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_mask_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.128 + return _mm_mask_scalef_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_scalef_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_maskz_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.128 + return _mm_maskz_scalef_pbh(__U, __A, __B); +} + +__m128bh test_mm_rcp_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.128 + return _mm_rcp_pbh(__A); +} + +__m128bh test_mm_mask_rcp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.128 + return (__m128bh)_mm_mask_rcp_pbh(__W, __U, __A); +} + +__m128bh test_mm_maskz_rcp_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.128 + return _mm_maskz_rcp_pbh(__U, __A); +} + +__m128bh test_mm_getexp_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.128 + return _mm_getexp_pbh(__A); +} + +__m128bh test_mm_mask_getexp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.128 + return _mm_mask_getexp_pbh(__W, __U, __A); +} + +__m128bh test_mm_maskz_getexp_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.128 + return _mm_maskz_getexp_pbh(__U, __A); +} + +__m128bh test_mm_rsqrt_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.128 + return _mm_rsqrt_pbh(__A); +} + +__m128bh test_mm_mask_rsqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.128 + return (__m128bh)_mm_mask_rsqrt_pbh(__W, __U, __A); +} + +__m128bh test_mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_rsqrt_pbh + // CHECK: 
@llvm.x86.avx10.mask.rsqrt.nepbf16.128 + return _mm_maskz_rsqrt_pbh(__U, __A); +} + +__m128bh test_mm_reducene_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.128 + return _mm_reducene_pbh(__A, 3); +} + +__m128bh test_mm_mask_reducene_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.128 + return _mm_mask_reducene_pbh(__W, __U, __A, 1); +} + +__m128bh test_mm_maskz_reducene_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.128 + return _mm_maskz_reducene_pbh(__U, __A, 1); +} + +__m128bh test_mm_roundscalene_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.128 + return _mm_roundscalene_pbh(__A, 3); +} + +__m128bh test_mm_mask_roundscalene_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.128 + return _mm_mask_roundscalene_pbh(__W, __U, __A, 1); +} + +__m128bh test_mm_maskz_roundscalene_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.128 + return _mm_maskz_roundscalene_pbh(__U, __A, 1 ); +} + +__m128bh test_mm_getmant_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.128 + return _mm_getmant_pbh(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m128bh test_mm_mask_getmant_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.128 + return _mm_mask_getmant_pbh(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m128bh test_mm_maskz_getmant_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.128 + return _mm_maskz_getmant_pbh(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m128bh test_mm_sqrt_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_sqrt_pbh + // CHECK: call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> {{.*}}) + return _mm_sqrt_pbh(__A); +} + +__m128bh test_mm_mask_sqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_sqrt_pbh + // CHECK: call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> {{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_sqrt_pbh(__W, __U, __A); +} + +__m128bh test_mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_sqrt_pbh + // CHECK: call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> {{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_sqrt_pbh(__U, __A); +} + +__m256bh test_mm256_fmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_fmaddne_pbh + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + return _mm256_fmaddne_pbh(__A, __B, __C); +} + +__m256bh test_mm256_mask_fmaddne_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_mask_fmaddne_pbh + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return 
_mm256_mask_fmaddne_pbh(__A, __U, __B, __C); +} + +__m256bh test_mm256_mask3_fmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fmaddne_pbh + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask3_fmaddne_pbh(__A, __B, __C, __U); +} + +__m256bh test_mm256_maskz_fmaddne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_maskz_fmaddne_pbh + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_fmaddne_pbh(__U, __A, __B, __C); +} + +__m256bh test_mm256_fmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_fmsubne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + return _mm256_fmsubne_pbh(__A, __B, __C); +} + +__m256bh test_mm256_mask_fmsubne_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_mask_fmsubne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask_fmsubne_pbh(__A, __U, __B, __C); +} + +__m256bh test_mm256_mask3_fmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fmsubne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask3_fmsubne_pbh(__A, __B, __C, __U); +} + +__m256bh test_mm256_maskz_fmsubne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_maskz_fmsubne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_fmsubne_pbh(__U, __A, __B, __C); +} + +__m256bh test_mm256_fnmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + return _mm256_fnmaddne_pbh(__A, __B, __C); +} + +__m256bh test_mm256_mask_fnmaddne_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_mask_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask_fnmaddne_pbh(__A, __U, __B, __C); +} + +__m256bh test_mm256_mask3_fnmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return 
_mm256_mask3_fnmaddne_pbh(__A, __B, __C, __U); +} + +__m256bh test_mm256_maskz_fnmaddne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_maskz_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_fnmaddne_pbh(__U, __A, __B, __C); +} + +__m256bh test_mm256_fnmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + return _mm256_fnmsubne_pbh(__A, __B, __C); +} + +__m256bh test_mm256_mask_fnmsubne_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_mask_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask_fnmsubne_pbh(__A, __U, __B, __C); +} + +__m256bh test_mm256_mask3_fnmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask3_fnmsubne_pbh(__A, __B, __C, __U); +} + +__m256bh test_mm256_maskz_fnmsubne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_maskz_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_fnmsubne_pbh(__U, __A, __B, __C); +} + +__m128bh test_mm_fmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_fmaddne_pbh + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_fmaddne_pbh(__A, __B, __C); +} + +__m128bh test_mm_mask_fmaddne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_mask_fmaddne_pbh + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask_fmaddne_pbh(__A, __U, __B, __C); +} + +__m128bh test_mm_mask3_fmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmaddne_pbh + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask3_fmaddne_pbh(__A, __B, __C, __U); +} + +__m128bh test_mm_maskz_fmaddne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_maskz_fmaddne_pbh + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_fmaddne_pbh(__U, __A, 
__B, __C); +} + +__m128bh test_mm_fmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_fmsubne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_fmsubne_pbh(__A, __B, __C); +} + +__m128bh test_mm_mask_fmsubne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_mask_fmsubne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask_fmsubne_pbh(__A, __U, __B, __C); +} + +__m128bh test_mm_mask3_fmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmsubne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask3_fmsubne_pbh(__A, __B, __C, __U); +} + +__m128bh test_mm_maskz_fmsubne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_maskz_fmsubne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_fmsubne_pbh(__U, __A, __B, __C); +} + +__m128bh test_mm_fnmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_fnmaddne_pbh(__A, __B, __C); +} + +__m128bh test_mm_mask_fnmaddne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_mask_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask_fnmaddne_pbh(__A, __U, __B, __C); +} + +__m128bh test_mm_mask3_fnmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask3_fnmaddne_pbh(__A, __B, __C, __U); +} + +__m128bh test_mm_maskz_fnmaddne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_maskz_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_fnmaddne_pbh(__U, __A, __B, __C); +} + +__m128bh test_mm_fnmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_fnmsubne_pbh(__A, __B, __C); +} + +__m128bh test_mm_mask_fnmsubne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_mask_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x bfloat> 
@llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask_fnmsubne_pbh(__A, __U, __B, __C); +} + +__m128bh test_mm_mask3_fnmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask3_fnmsubne_pbh(__A, __B, __C, __U); +} + +__m128bh test_mm_maskz_fnmsubne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_maskz_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_fnmsubne_pbh(__U, __A, __B, __C); +} diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 8d000ed1e4f85..fafa5051bfb1b 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -7219,3 +7219,256 @@ def int_x86_avx10_mask_vcvtneph2hf8s512 : ClangBuiltin<"__builtin_ia32_vcvtneph2 DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; } + +//===----------------------------------------------------------------------===// +let TargetPrefix = "x86" in { +def int_x86_avx10_vaddnepbf16512 : ClangBuiltin<"__builtin_ia32_vaddnepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vaddnepbf16256 : ClangBuiltin<"__builtin_ia32_vaddnepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vaddnepbf16128 : ClangBuiltin<"__builtin_ia32_vaddnepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vsubnepbf16512 : ClangBuiltin<"__builtin_ia32_vsubnepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vsubnepbf16256 : ClangBuiltin<"__builtin_ia32_vsubnepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vsubnepbf16128 : ClangBuiltin<"__builtin_ia32_vsubnepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmulnepbf16512 : ClangBuiltin<"__builtin_ia32_vmulnepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmulnepbf16256 : ClangBuiltin<"__builtin_ia32_vmulnepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmulnepbf16128 : ClangBuiltin<"__builtin_ia32_vmulnepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vdivnepbf16512 : ClangBuiltin<"__builtin_ia32_vdivnepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vdivnepbf16256 : ClangBuiltin<"__builtin_ia32_vdivnepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + 
[IntrNoMem]>; +def int_x86_avx10_vdivnepbf16128 : ClangBuiltin<"__builtin_ia32_vdivnepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmaxpbf16512 : ClangBuiltin<"__builtin_ia32_vmaxpbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmaxpbf16256 : ClangBuiltin<"__builtin_ia32_vmaxpbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmaxpbf16128 : ClangBuiltin<"__builtin_ia32_vmaxpbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vminpbf16512 : ClangBuiltin<"__builtin_ia32_vminpbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vminpbf16256 : ClangBuiltin<"__builtin_ia32_vminpbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vminpbf16128 : ClangBuiltin<"__builtin_ia32_vminpbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16eq : ClangBuiltin<"__builtin_ia32_vcomsbf16eq">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16lt : ClangBuiltin<"__builtin_ia32_vcomsbf16lt">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty,llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16le : ClangBuiltin<"__builtin_ia32_vcomsbf16le">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16gt : ClangBuiltin<"__builtin_ia32_vcomsbf16gt">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16ge : ClangBuiltin<"__builtin_ia32_vcomsbf16ge">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16neq : ClangBuiltin<"__builtin_ia32_vcomsbf16neq">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rsqrt_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vrsqrtpbf16128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rsqrt_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vrsqrtpbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rsqrt_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vrsqrtpbf16512_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rcp_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vrcppbf16128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rcp_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vrcppbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rcp_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vrcppbf16512_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_reduce_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vreducenepbf16128_mask">, + 
DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_i32_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_reduce_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vreducenepbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_i32_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_reduce_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vreducenepbf16512_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_i32_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_fpclass_nepbf16_128 : + DefaultAttrsIntrinsic<[llvm_v8i1_ty], [llvm_v8bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_fpclass_nepbf16_256 : + DefaultAttrsIntrinsic<[llvm_v16i1_ty], [llvm_v16bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_fpclass_nepbf16_512 : + DefaultAttrsIntrinsic<[llvm_v32i1_ty], [llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_getexp_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vgetexppbf16128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_getexp_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vgetexppbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_getexp_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vgetexppbf16512_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_getmant_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vgetmantpbf16128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_i32_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_getmant_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vgetmantpbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_i32_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_getmant_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vgetmantpbf16512_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_i32_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_rndscale_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vrndscalenepbf16_128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_i32_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_rndscale_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vrndscalenepbf16_256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_i32_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_rndscale_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vrndscalenepbf16_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_i32_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_scalef_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vscalefpbf16128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_scalef_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vscalefpbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_scalef_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vscalefpbf16512_mask">, +
DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd213nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmadd213nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd213nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmadd213nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfmadd132nepbf16512 : ClangBuiltin<"__builtin_ia32_vfmadd132nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd132nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmadd132nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd132nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmadd132nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfmadd231nepbf16512 : ClangBuiltin<"__builtin_ia32_vfmadd231nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd231nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmadd231nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd231nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmadd231nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfmsub213nepbf16512 : ClangBuiltin<"__builtin_ia32_vfmsub213nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub213nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmsub213nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub213nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmsub213nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfmsub132nepbf16512 : ClangBuiltin<"__builtin_ia32_vfmsub132nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub132nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmsub132nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub132nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmsub132nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfmsub231nepbf16512 : ClangBuiltin<"__builtin_ia32_vfmsub231nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub231nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmsub231nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub231nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmsub231nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, 
llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd213nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmadd213nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd213nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmadd213nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd213nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmadd213nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd132nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmadd132nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd132nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmadd132nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd132nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmadd132nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd231nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmadd231nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd231nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmadd231nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd231nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmadd231nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub213nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmsub213nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub213nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmsub213nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub213nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmsub213nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub132nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmsub132nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub132nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmsub132nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub132nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmsub132nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub231nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmsub231nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub231nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmsub231nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; 
+def int_x86_avx10_vfnmsub231nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmsub231nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +} diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 2b6b0ad16bcf7..03f49306c2b7b 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3305,11 +3305,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, if ((PatchedName.starts_with("cmp") || PatchedName.starts_with("vcmp")) && (PatchedName.ends_with("ss") || PatchedName.ends_with("sd") || PatchedName.ends_with("sh") || PatchedName.ends_with("ph") || - PatchedName.ends_with("ps") || PatchedName.ends_with("pd"))) { + PatchedName.ends_with("pbf16") || PatchedName.ends_with("ps") || + PatchedName.ends_with("pd"))) { bool IsVCMP = PatchedName[0] == 'v'; unsigned CCIdx = IsVCMP ? 4 : 3; + unsigned suffixLength = PatchedName.ends_with("pbf16") ? 5 : 2; unsigned CC = StringSwitch( - PatchedName.slice(CCIdx, PatchedName.size() - 2)) + PatchedName.slice(CCIdx, PatchedName.size() - suffixLength)) .Case("eq", 0x00) .Case("eq_oq", 0x00) .Case("lt", 0x01) @@ -3372,6 +3374,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, PatchedName = "vcmpsh"; else if (PatchedName.ends_with("ph")) PatchedName = "vcmpph"; + else if (PatchedName.ends_with("pbf16")) + PatchedName = "vcmppbf16"; else llvm_unreachable("Unexpected suffix!"); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index 33104524c5a89..8fcc1c10d93a0 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -167,6 +167,15 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk: + case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri: + case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri: + case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri: + case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik: + case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik: + case X86::VCMPPBF16Zrmik: case X86::VCMPPBF16Zrrik: + case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik: + case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik: + case X86::VCMPPBF16Zrmbi: case X86::VCMPPBF16Zrmbik: if (Imm >= 0 && Imm <= 31) { OS << '\t'; printCMPMnemonic(MI, /*IsVCMP*/true, OS); @@ -205,7 +214,8 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, printwordmem(MI, CurOp--, OS); else printdwordmem(MI, CurOp--, OS); - } else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) { + } else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD && + (Desc.TSFlags & X86II::OpMapMask) != X86II::TA) { assert((Desc.TSFlags & X86II::OpMapMask) != X86II::TA && "Unexpected op map!"); printqwordmem(MI, CurOp--, OS); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index ad1f2dc532d1c..e7ba13215feb5 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -309,6 +309,17 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp, case 
X86::VCMPSHZrmi_Intk: case X86::VCMPSHZrri_Intk: OS << "sh\t"; break; + case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri: + case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri: + case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri: + case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik: + case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik: + case X86::VCMPPBF16Zrmik: case X86::VCMPPBF16Zrrik: + case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik: + case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik: + case X86::VCMPPBF16Zrmbi: case X86::VCMPPBF16Zrmbik: + OS << "pbf16\t"; + break; } } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index 7c8459a546516..39600ffcadd8e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -146,6 +146,15 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk: + case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri: + case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri: + case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri: + case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik: + case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik: + case X86::VCMPPBF16Zrmik: case X86::VCMPPBF16Zrrik: + case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik: + case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik: + case X86::VCMPPBF16Zrmbi: case X86::VCMPPBF16Zrmbik: if (Imm >= 0 && Imm <= 31) { OS << '\t'; printCMPMnemonic(MI, /*IsVCMP*/true, OS); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f7878c78a5231..451881e1d6141 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2360,6 +2360,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom); } + if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) { + addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass); + addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass); + addRegisterClass(MVT::v32bf16, &X86::VR512RegClass); + + setOperationAction(ISD::FADD, MVT::v32bf16, Legal); + setOperationAction(ISD::FSUB, MVT::v32bf16, Legal); + setOperationAction(ISD::FMUL, MVT::v32bf16, Legal); + setOperationAction(ISD::FDIV, MVT::v32bf16, Legal); + setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal); + setOperationAction(ISD::FMA, MVT::v32bf16, Legal); + setOperationAction(ISD::SETCC, MVT::v32bf16, Custom); + if (Subtarget.hasVLX()) { + for (auto VT : {MVT::v8bf16, MVT::v16bf16}) { + setOperationAction(ISD::FADD, VT, Legal); + setOperationAction(ISD::FSUB, VT, Legal); + setOperationAction(ISD::FMUL, VT, Legal); + setOperationAction(ISD::FDIV, VT, Legal); + setOperationAction(ISD::FSQRT, VT, Legal); + setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::SETCC, VT, Custom); + } + } + } + if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); @@ -12212,7 +12237,8 @@ static bool isShuffleFoldableLoad(SDValue V) { template static bool isSoftF16(T VT, const X86Subtarget &Subtarget) { T EltVT = VT.getScalarType(); - return EltVT == MVT::bf16 || (EltVT == MVT::f16 && 
!Subtarget.hasFP16()); + return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) || + (EltVT == MVT::f16 && !Subtarget.hasFP16()); } /// Try to lower insertion of a single element into a zero vector. @@ -23265,7 +23291,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, if (isFP) { MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); - assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64); + assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 || + EltVT == MVT::f64); if (isSoftF16(EltVT, Subtarget)) return SDValue(); @@ -23282,7 +23309,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, Op0.getSimpleValueType().is512BitVector())) { #ifndef NDEBUG unsigned Num = VT.getVectorNumElements(); - assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16)); + assert(Num <= 16 || + (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16))); #endif Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM; } else { @@ -54159,7 +54187,8 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, EVT ScalarVT = VT.getScalarType(); if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) && - !(ScalarVT == MVT::f16 && Subtarget.hasFP16())) + !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) && + !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2())) return SDValue(); auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) { diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index a518347cfcd82..b0eb210b687b1 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -910,3 +910,318 @@ multiclass avx10_convert_2op_nomb, AVX512XDIi8Base, T_MAP5, EVEX, EVEX_CD8<16, CD8VH>; + +//------------------------------------------------- +// AVX10 BF16 instructions +//------------------------------------------------- + +// VADDNEPBF16, VSUBNEPBF16, VMULNEPBF16, VDIVNEPBF16, VMAXPBF16, VMINPBF16 +multiclass avx10_fp_binopne_int_pbf16 opc, string OpcodeStr, + X86SchedWriteSizes sched, + bit IsCommutable = 0> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fp_packed("int_x86_avx10_"#OpcodeStr#"pbf16512"), + !cast("int_x86_avx10_"#OpcodeStr#"pbf16512"), + v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fp_packed("int_x86_avx10_"#OpcodeStr#"pbf16128"), + !cast("int_x86_avx10_"#OpcodeStr#"pbf16128"), + v8bf16x_info, sched.PH.XMM, IsCommutable>, EVEX_V128, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_fp_packed("int_x86_avx10_"#OpcodeStr#"pbf16256"), + !cast("int_x86_avx10_"#OpcodeStr#"pbf16256"), + v16bf16x_info, sched.PH.YMM, IsCommutable>, EVEX_V256, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + } +} + +multiclass avx10_fp_binop_pbf16 opc, string OpcodeStr, SDPatternOperator OpNode, + X86SchedWriteSizes sched, + bit IsCommutable = 0, + SDPatternOperator MaskOpNode = OpNode> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fp_packed, EVEX_V512, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fp_packed, EVEX_V128, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_fp_packed, EVEX_V256, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + } +} + +let Uses = [], mayRaiseFPException = 0 in { +defm VADDNEPBF16 : avx10_fp_binop_pbf16<0x58, "vaddne", fadd, SchedWriteFAddSizes, 1>; +defm VSUBNEPBF16 : avx10_fp_binop_pbf16<0x5C, "vsubne", fsub, SchedWriteFAddSizes, 0>; +defm VMULNEPBF16 : avx10_fp_binop_pbf16<0x59, 
"vmulne", fmul, SchedWriteFMulSizes, 1>; +defm VDIVNEPBF16 : avx10_fp_binop_pbf16<0x5E, "vdivne", fdiv, SchedWriteFDivSizes, 0>; +defm VMINPBF16 : avx10_fp_binopne_int_pbf16<0x5D, "vmin", SchedWriteFCmpSizes, 0>; +defm VMAXPBF16 : avx10_fp_binopne_int_pbf16<0x5F, "vmax", SchedWriteFCmpSizes, 0>; +} + +// VCOMSBF16 +let Uses = [], mayRaiseFPException = 0, + Defs = [EFLAGS], Predicates = [HasAVX10_2] in { + //TODO: Replace null_frag with X86fcmp to support lowering `fcmp oeq bfloat *` + //which may require extend supports on BFR16X, loadbf16, ... + defm VCOMSBF16Z : sse12_ord_cmp<0x2F, FR16X, null_frag, bf16, f16mem, loadf16, + "comsbf16", SSEPackedSingle>, T_MAP5, PD, EVEX, + VEX_LIG, EVEX_CD8<16, CD8VT1>; + + let isCodeGenOnly = 1 in { + defm VCOMSBF16Z : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v8bf16, f16mem, + sse_load_bf16, "comsbf16", SSEPackedSingle>, + T_MAP5, PD, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; + } +} + +// VCMPPBF16 +multiclass avx10_vcmp_common_bf16 { + let mayRaiseFPException = 0 in { + defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), + 1>, Sched<[sched]>; + + defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), + timm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), + timm:$cc)>, + Sched<[sched.Folded, sched.ReadAfterFold]>; + + defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, ${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr#", $cc", + (X86cmpm (_.VT _.RC:$src1), + (_.VT (_.BroadcastLdFrag addr:$src2)), + timm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), + (_.VT (_.BroadcastLdFrag addr:$src2)), + timm:$cc)>, + EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } +} + +multiclass avx10_vcmp_bf16 { + let Predicates = [HasAVX10_2_512] in + defm Z : avx10_vcmp_common_bf16, EVEX_V512; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx10_vcmp_common_bf16, EVEX_V128; + defm Z256 : avx10_vcmp_common_bf16, EVEX_V256; + } +} + +defm VCMPPBF16 : avx10_vcmp_bf16, + AVX512XDIi8Base, EVEX, VVVV, + EVEX_CD8<16, CD8VF>, TA; + + +// VSQRTNEPBF16 +multiclass avx10_sqrt_packed_bf16 opc, string OpcodeStr, + X86SchedWriteSizes sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_sqrt_packed, + EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_sqrt_packed, + EVEX_V128, PD, T_MAP5, EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_sqrt_packed, + EVEX_V256, PD, T_MAP5, EVEX_CD8<16, CD8VF>; + } +} + +let Uses = [], mayRaiseFPException = 0 in +defm VSQRTNEPBF16 : avx10_sqrt_packed_bf16<0x51, "vsqrtne", SchedWriteFSqrtSizes>; + +// VRSQRTPBF16, VRCPPBF16, VSRQTPBF16, VGETEXPPBF16 +multiclass avx10_fp14_pbf16 opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm PBF16Z : avx512_fp14_p, + EVEX_V512; + let Predicates = [HasAVX10_2] in { + defm PBF16Z128 : avx512_fp14_p, + EVEX_V128; + defm PBF16Z256 : avx512_fp14_p, + EVEX_V256; + } +} + +defm VRSQRT : avx10_fp14_pbf16<0x4E, "vrsqrt", X86rsqrt14, SchedWriteFRsqrt>, + 
T_MAP6, PS, EVEX_CD8<16, CD8VF>; +defm VRCP : avx10_fp14_pbf16<0x4C, "vrcp", X86rcp14, SchedWriteFRcp>, + T_MAP6, PS, EVEX_CD8<16, CD8VF>; +defm VGETEXP : avx10_fp14_pbf16<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>, + T_MAP5, EVEX_CD8<16, CD8VF>; + +// VSCALEFPBF16 +multiclass avx10_fp_scalef_bf16 opc, string OpcodeStr, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fp_scalef_p, + EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fp_scalef_p, + EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6, PS; + defm Z256 : avx512_fp_scalef_p, + EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6, PS; + } +} + +let Uses = [], mayRaiseFPException = 0 in +defm VSCALEFPBF16 : avx10_fp_scalef_bf16<0x2C, "vscalef", SchedWriteFAdd>; + +// VREDUCENEPBF16, VRNDSCALENEPBF16, VGETMANTPBF16 +multiclass avx10_common_unary_fp_packed_imm_bf16 opc, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_unary_fp_packed_imm, EVEX_V512; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_unary_fp_packed_imm, EVEX_V128; + defm Z256 : avx512_unary_fp_packed_imm, EVEX_V256; + } +} + +let Uses = [], mayRaiseFPException = 0 in { +defm VREDUCENEPBF16 : avx10_common_unary_fp_packed_imm_bf16<"vreducene", avx512vl_bf16_info, 0x56, + X86VReduce, X86VReduce, SchedWriteFRnd>, + AVX512XDIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>; +defm VRNDSCALENEPBF16 : avx10_common_unary_fp_packed_imm_bf16<"vrndscalene", avx512vl_bf16_info, 0x08, + X86any_VRndScale, X86VRndScale, SchedWriteFRnd>, + AVX512XDIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>; +defm VGETMANTPBF16 : avx10_common_unary_fp_packed_imm_bf16<"vgetmant", avx512vl_bf16_info, 0x26, + X86VGetMant, X86VGetMant, SchedWriteFRnd>, + AVX512XDIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>; +} + +// VFPCLASSPBF16 +multiclass avx10_fp_fpclass_bf16 opcVec, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_vector_fpclass>, EVEX_V512; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_vector_fpclass>, EVEX_V128; + defm Z256 : avx512_vector_fpclass>, EVEX_V256; + } +} + +defm VFPCLASSPBF16 : avx10_fp_fpclass_bf16<"vfpclass", 0x66, SchedWriteFCmp>, + AVX512XDIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>; + +// VF[,N]M[ADD,SUB][132,213,231]NEPBF16 +multiclass avx10_fma3p_213_bf16 opc, string OpcodeStr, + SDPatternOperator OpNode, SDNode MaskOpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fma3p_213_rm, EVEX_V512, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fma3p_213_rm, EVEX_V128, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_fma3p_213_rm, EVEX_V256, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + } +} + +let Uses = [], mayRaiseFPException = 0 in { +defm VFMADD213NEPBF16 : avx10_fma3p_213_bf16<0xA8, "vfmadd213nepbf16", any_fma, + fma, SchedWriteFMA>; +defm VFMSUB213NEPBF16 : avx10_fma3p_213_bf16<0xAA, "vfmsub213nepbf16", X86any_Fmsub, + X86Fmsub, SchedWriteFMA>; +defm VFNMADD213NEPBF16 : avx10_fma3p_213_bf16<0xAC, "vfnmadd213nepbf16", X86any_Fnmadd, + X86Fnmadd, SchedWriteFMA>; +defm VFNMSUB213NEPBF16 : avx10_fma3p_213_bf16<0xAE, "vfnmsub213nepbf16", X86any_Fnmsub, + X86Fnmsub, SchedWriteFMA>; +} + +multiclass avx10_fma3p_231_bf16 opc, string OpcodeStr, + SDPatternOperator OpNode, SDNode MaskOpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fma3p_231_rm, EVEX_V512, T_MAP6, 
PS, + EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fma3p_231_rm, EVEX_V128, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_fma3p_231_rm, EVEX_V256, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + } +} + +let Uses = [], mayRaiseFPException = 0 in { +defm VFMADD231NEPBF16 : avx10_fma3p_231_bf16<0xB8, "vfmadd231nepbf16", any_fma, + fma, SchedWriteFMA>; +defm VFMSUB231NEPBF16 : avx10_fma3p_231_bf16<0xBA, "vfmsub231nepbf16", X86any_Fmsub, + X86Fmsub, SchedWriteFMA>; +defm VFNMADD231NEPBF16 : avx10_fma3p_231_bf16<0xBC, "vfnmadd231nepbf16", X86any_Fnmadd, + X86Fnmadd, SchedWriteFMA>; +defm VFNMSUB231NEPBF16 : avx10_fma3p_231_bf16<0xBE, "vfnmsub231nepbf16", X86any_Fnmsub, + X86Fnmsub, SchedWriteFMA>; +} + +multiclass avx10_fma3p_132_bf16 opc, string OpcodeStr, + SDPatternOperator OpNode, SDNode MaskOpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fma3p_132_rm, EVEX_V512, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fma3p_132_rm, EVEX_V128, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_fma3p_132_rm, EVEX_V256, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + } +} + +let Uses = [], mayRaiseFPException = 0 in { +defm VFMADD132NEPBF16 : avx10_fma3p_132_bf16<0x98, "vfmadd132nepbf16", any_fma, + fma, SchedWriteFMA>; +defm VFMSUB132NEPBF16 : avx10_fma3p_132_bf16<0x9A, "vfmsub132nepbf16", X86any_Fmsub, + X86Fmsub, SchedWriteFMA>; +defm VFNMADD132NEPBF16 : avx10_fma3p_132_bf16<0x9C, "vfnmadd132nepbf16", X86any_Fnmadd, + X86Fnmadd, SchedWriteFMA>; +defm VFNMSUB132NEPBF16 : avx10_fma3p_132_bf16<0x9E, "vfnmsub132nepbf16", X86any_Fnmsub, + X86Fnmsub, SchedWriteFMA>; +} diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 88d1eb5986243..c988524213123 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2495,8 +2495,8 @@ multiclass avx512_scalar_fpclass opc, string OpcodeStr, // fpclass(reg_vec, broadcast(eltVt), imm) multiclass avx512_vector_fpclass opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, - string mem>{ - let ExeDomain = _.ExeDomain, Uses = [MXCSR] in { + string mem, list _Uses = [MXCSR]>{ + let ExeDomain = _.ExeDomain, Uses = _Uses in { def rr : AVX512, SDTCVecEltisVT<0, i1>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>; + def X86MaskCmpMaskCC : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisVec<1>, SDTCisSameAs<2, 1>, @@ -1139,6 +1140,10 @@ def X86SubVBroadcastld256 : PatFrag<(ops node:$src), // only load a single element. // FIXME: We should add more canolicalizing in DAGCombine. Particulary removing // the simple_load case. 
+def sse_load_bf16 : PatFrags<(ops node:$ptr), + [(v8bf16 (simple_load node:$ptr)), + (v8bf16 (X86vzload16 node:$ptr)), + (v8bf16 (scalar_to_vector (loadf16 node:$ptr)))]>; def sse_load_f16 : PatFrags<(ops node:$ptr), [(v8f16 (simple_load node:$ptr)), (v8f16 (X86vzload16 node:$ptr)), diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 208af630a352d..531268b41da96 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -313,7 +313,7 @@ def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; def v32f16_info : X86VectorVTInfo<32, f16, VR512, "ph">; -def v32bf16_info: X86VectorVTInfo<32, bf16, VR512, "pbh">; +def v32bf16_info: X86VectorVTInfo<32, bf16, VR512, "pbf16">; def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; @@ -323,7 +323,7 @@ def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; def v16f16x_info : X86VectorVTInfo<16, f16, VR256X, "ph">; -def v16bf16x_info: X86VectorVTInfo<16, bf16, VR256X, "pbh">; +def v16bf16x_info: X86VectorVTInfo<16, bf16, VR256X, "pbf16">; def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; @@ -332,7 +332,7 @@ def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; def v8f16x_info : X86VectorVTInfo<8, f16, VR128X, "ph">; -def v8bf16x_info : X86VectorVTInfo<8, bf16, VR128X, "pbh">; +def v8bf16x_info : X86VectorVTInfo<8, bf16, VR128X, "pbf16">; def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 68c1ce072549b..4f39e66e22c23 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -389,6 +389,54 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx10_fpclass_nepbf16_128, INTR_TYPE_2OP, + X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx10_fpclass_nepbf16_256, INTR_TYPE_2OP, + X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx10_fpclass_nepbf16_512, INTR_TYPE_2OP, + X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx10_mask_getexp_nepbf16_128, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx10_mask_getexp_nepbf16_256, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx10_mask_getexp_nepbf16_512, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx10_mask_getmant_nepbf16_128, INTR_TYPE_2OP_MASK, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx10_mask_getmant_nepbf16_256, INTR_TYPE_2OP_MASK, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx10_mask_getmant_nepbf16_512, INTR_TYPE_2OP_MASK, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx10_mask_rcp_nepbf16_128, INTR_TYPE_1OP_MASK, + X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx10_mask_rcp_nepbf16_256, INTR_TYPE_1OP_MASK, + X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx10_mask_rcp_nepbf16_512, INTR_TYPE_1OP_MASK, + X86ISD::RCP14, 0), + 
X86_INTRINSIC_DATA(avx10_mask_reduce_nepbf16_128, INTR_TYPE_2OP_MASK, + X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx10_mask_reduce_nepbf16_256, INTR_TYPE_2OP_MASK, + X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx10_mask_reduce_nepbf16_512, INTR_TYPE_2OP_MASK, + X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx10_mask_rndscale_nepbf16_128, INTR_TYPE_2OP_MASK, + X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx10_mask_rndscale_nepbf16_256, INTR_TYPE_2OP_MASK, + X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx10_mask_rndscale_nepbf16_512, INTR_TYPE_2OP_MASK, + X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx10_mask_rsqrt_nepbf16_128, INTR_TYPE_1OP_MASK, + X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx10_mask_rsqrt_nepbf16_256, INTR_TYPE_1OP_MASK, + X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx10_mask_rsqrt_nepbf16_512, INTR_TYPE_1OP_MASK, + X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx10_mask_scalef_nepbf16_128, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx10_mask_scalef_nepbf16_256, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx10_mask_scalef_nepbf16_512, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx10_mask_vcmppd256, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE), X86_INTRINSIC_DATA(avx10_mask_vcmpph256, CMP_MASK_CC, X86ISD::CMPMM, @@ -655,6 +703,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx10_vaddps256, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx10_vcomsbf16eq, COMI, X86ISD::COMI, ISD::SETEQ), + X86_INTRINSIC_DATA(avx10_vcomsbf16ge, COMI, X86ISD::COMI, ISD::SETGE), + X86_INTRINSIC_DATA(avx10_vcomsbf16gt, COMI, X86ISD::COMI, ISD::SETGT), + X86_INTRINSIC_DATA(avx10_vcomsbf16le, COMI, X86ISD::COMI, ISD::SETLE), + X86_INTRINSIC_DATA(avx10_vcomsbf16lt, COMI, X86ISD::COMI, ISD::SETLT), + X86_INTRINSIC_DATA(avx10_vcomsbf16neq, COMI, X86ISD::COMI, ISD::SETNE), X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8128, INTR_TYPE_2OP, X86ISD::VCVTNE2PH2BF8, 0), X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8256, INTR_TYPE_2OP, diff --git a/llvm/test/CodeGen/X86/avx10.2-fma-commute.ll b/llvm/test/CodeGen/X86/avx10.2-fma-commute.ll new file mode 100644 index 0000000000000..c41e03ba637cb --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10.2-fma-commute.ll @@ -0,0 +1,1244 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mtriple=x86_64-unknown-unknown -mattr=avx10.2-512 | FileCheck %s + +define <8 x bfloat> @fma_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) { +; CHECK-LABEL: fma_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) { +; CHECK-LABEL: fma_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_231_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) { +; CHECK-LABEL: fma_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> 
%z) { +; CHECK-LABEL: fma_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %y, <8 x bfloat> %x) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) { +; CHECK-LABEL: fma_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) { +; CHECK-LABEL: fma_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_231_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %y, <8 x bfloat> %x) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_mask_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x 
i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_231_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %xmm1, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %y, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm1, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_231_v8bf16(<8 
x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %y, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_load_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_load_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_load_231_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 
x bfloat> @fma_mask_load_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %y, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_load_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_load_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_231_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> 
%z, <8 x bfloat> %y, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <16 x bfloat> @fma_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z 
= load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_mask_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + 
ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %ymm1, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm1, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, 
<16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; 
CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x 
bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <32 x bfloat> @fma_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { 
+; CHECK-LABEL: fma_load_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_mask_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 
%mask) { +; CHECK-LABEL: fma_mask_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> 
@fma_maskz_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> 
@llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> 
%c +} diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll new file mode 100644 index 0000000000000..33c40ac6bb32c --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll @@ -0,0 +1,587 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86 + +define <32 x bfloat> @test_int_x86_avx10_vaddnepbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vaddnepbf16512: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x58,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fadd <32 x bfloat> %x1, %x2 + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_mask_add_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_add_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x58,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_add_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x58,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = fadd <32 x bfloat> %x1, %x2 + %res = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> %src + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_add_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_add_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0xc2] +; X64-NEXT: vaddnepbf16 (%rsi), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0x0e] +; X64-NEXT: vaddnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x58,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_add_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0xc2] +; X86-NEXT: vaddnepbf16 (%eax), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0x08] +; X86-NEXT: vaddnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x58,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x bfloat>, ptr %ptr + %res0 = fadd <32 x bfloat> %x1, %x2 + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + %t2 = fadd <32 x bfloat> %x1, %val + %res2 = select <32 x i1> %mask, <32 x bfloat> %t2, <32 x bfloat> zeroinitializer + %res3 = fadd <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} + +define <32 x bfloat> @test_int_x86_avx10_sub_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> 
%x2) { +; CHECK-LABEL: test_int_x86_avx10_sub_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fsub <32 x bfloat> %x1, %x2 + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_mask_sub_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_sub_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x5c,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_sub_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x5c,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = fsub <32 x bfloat> %x1, %x2 + %res = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> %src + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_sub_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2] +; X64-NEXT: vsubnepbf16 (%rsi), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x0e] +; X64-NEXT: vsubnepbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2] +; X86-NEXT: vsubnepbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08] +; X86-NEXT: vsubnepbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x bfloat>, ptr %ptr + %res0 = fsub <32 x bfloat> %x1, %x2 + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + %t2 = fsub <32 x bfloat> %x1, %val + %res2 = select <32 x i1> %mask, <32 x bfloat> %t2, <32 x bfloat> zeroinitializer + %res3 = fsub <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} + +declare <32 x bfloat> @llvm.x86.avx10.vmulnepbf16512(<32 x bfloat>, <32 x bfloat>) + +define <32 x bfloat> @test_int_x86_avx10_mul_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_mul_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x59,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fmul <32 x bfloat> %x1, %x2 + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_mask_mul_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_mul_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: 
[0x62,0xf5,0x75,0x49,0x59,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_mul_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x59,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = fmul <32 x bfloat> %x1, %x2 + %res = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> %src + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_mul_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0xc2] +; X64-NEXT: vmulnepbf16 (%rsi), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0x0e] +; X64-NEXT: vmulnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x59,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0xc2] +; X86-NEXT: vmulnepbf16 (%eax), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0x08] +; X86-NEXT: vmulnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x59,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x bfloat>, ptr %ptr + %res0 = fmul <32 x bfloat> %x1, %x2 + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + %t2 = fmul <32 x bfloat> %x1, %val + %res2 = select <32 x i1> %mask, <32 x bfloat> %t2, <32 x bfloat> zeroinitializer + %res3 = fmul <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} + +define <32 x bfloat> @test_int_x86_avx10_div_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_div_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fdiv <32 x bfloat> %x1, %x2 + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_mask_div_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_div_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x5e,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_div_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x5e,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = fdiv <32 x bfloat> %x1, %x2 + %res = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> %src + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_div_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: 
test_int_x86_avx10_maskz_div_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0xc2] +; X64-NEXT: vdivnepbf16 (%rsi), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0x0e] +; X64-NEXT: vdivnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_div_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0xc2] +; X86-NEXT: vdivnepbf16 (%eax), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0x08] +; X86-NEXT: vdivnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x bfloat>, ptr %ptr + %res0 = fdiv <32 x bfloat> %x1, %x2 + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + %t2 = fdiv <32 x bfloat> %x1, %val + %res2 = select <32 x i1> %mask, <32 x bfloat> %t2, <32 x bfloat> zeroinitializer + %res3 = fdiv <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} + +define i32 @test_int_x86_avx10_vcmppbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpunordpbf16 %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7f,0x48,0xc2,0xc1,0x03] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fcmp uno <32 x bfloat> %x1, %x2 + %res = bitcast <32 x i1> %1 to i32 + ret i32 %res +} + +; FIXME: _mm512_mask_cmp_p[s|h]_mask is not using {k2} but gcc does +define i32 @test_int_x86_avx10_vcmppbf16512_mask2(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16512_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpeqpbf16 %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7f,0x48,0xc2,0xc1,0x00] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: andl $3, %eax # encoding: [0x83,0xe0,0x03] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fcmp oeq <32 x bfloat> %x1, %x2 + %2 = and <32 x i1> %1, + %3 = bitcast <32 x i1> %2 to i32 + ret i32 %3 +} + +define <32 x bfloat> @test_sqrt_nepbf16_512(<32 x bfloat> %a0) { +; CHECK-LABEL: test_sqrt_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtnepbf16 %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x51,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = tail call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %a0) + ret <32 x bfloat> %1 +} + +define <32 x bfloat> @test_mm512_mask_sqrt_pbh(<32 x bfloat> %__W, i32 %__U, <32 x bfloat> %__A) { +; X64-LABEL: test_mm512_mask_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x51,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x51,0xc1] +; 
X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %__A) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__W + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_maskz_sqrt_pbh(i32 %__U, <32 x bfloat>%__A) { +; X64-LABEL: test_mm512_maskz_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x51,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x51,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %__A) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_fmaddne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; CHECK-LABEL: test_mm512_fmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) + ret <32 x bfloat> %0 +} + +define <32 x bfloat> @test_mm512_mask_fmaddne_pbh(<32 x bfloat> %__A, i32 %__U, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_mask_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__A + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_mask3_fmaddne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C, i32 %__U) { +; X64-LABEL: test_mm512_mask3_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xb8,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask3_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xb8,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x 
bfloat> %__C + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_maskz_fmaddne_pbh(i32 %__U, <32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_maskz_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xa8,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xa8,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_fmsubne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; CHECK-LABEL: test_mm512_fmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmsub213nepbf16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0xaa,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %fneg.i) + ret <32 x bfloat> %0 +} + +define <32 x bfloat> @test_mm512_mask_fmsubne_pbh(<32 x bfloat> %__A, i32 %__U, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_mask_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %fneg.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__A + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_mask3_fmsubne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C, i32 %__U) { +; X64-LABEL: test_mm512_mask3_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xba,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask3_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xba,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %fneg.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x 
i1> %1, <32 x bfloat> %0, <32 x bfloat> %__C + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_maskz_fmsubne_pbh(i32 %__U, <32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_maskz_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xaa,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xaa,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %fneg.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_fnmaddne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; CHECK-LABEL: test_mm512_fnmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmadd213nepbf16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <32 x bfloat> %__B + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i, <32 x bfloat> %__C) + ret <32 x bfloat> %0 +} + +define <32 x bfloat> @test_mm512_mask_fnmaddne_pbh(<32 x bfloat> %__A, i32 %__U, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_mask_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i.i, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__A + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_mask3_fnmaddne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C, i32 %__U) { +; X64-LABEL: test_mm512_mask3_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xbc,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask3_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xbc,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> 
%fneg.i.i, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__C + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_maskz_fnmaddne_pbh(i32 %__U, <32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_maskz_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xac,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xac,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i.i, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_fnmsubne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; CHECK-LABEL: test_mm512_fnmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmsub213nepbf16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0xae,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <32 x bfloat> %__B + %fneg1.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i, <32 x bfloat> %fneg1.i) + ret <32 x bfloat> %0 +} + +define <32 x bfloat> @test_mm512_mask_fnmsubne_pbh(<32 x bfloat> %__A, i32 %__U, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_mask_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %fneg1.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i.i, <32 x bfloat> %fneg1.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__A + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_mask3_fnmsubne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C, i32 %__U) { +; X64-LABEL: test_mm512_mask3_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xbe,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask3_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xbe,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: 
[0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %fneg1.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i.i, <32 x bfloat> %fneg1.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__C + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_maskz_fnmsubne_pbh(i32 %__U, <32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_maskz_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xae,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xae,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %fneg1.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i.i, <32 x bfloat> %fneg1.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %2 +} diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll new file mode 100644 index 0000000000000..7b81d547db085 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll @@ -0,0 +1,230 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86 + +declare <32 x bfloat> @llvm.x86.avx10.vminpbf16512(<32 x bfloat>, <32 x bfloat>) + +define <32 x bfloat> @test_int_x86_avx10_min_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_min_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vminpbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <32 x bfloat> @llvm.x86.avx10.vminpbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) + ret <32 x bfloat> %res0 +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_min_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk) { +; X64-LABEL: test_int_x86_avx10_maskz_min_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vminpbf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x5d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_min_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vminpbf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x5d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x bfloat> @llvm.x86.avx10.vminpbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %res1 +} + 
+declare <32 x bfloat> @llvm.x86.avx10.vmaxpbf16512(<32 x bfloat>, <32 x bfloat>) + +define <32 x bfloat> @test_int_x86_avx10_max_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_max_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxpbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <32 x bfloat> @llvm.x86.avx10.vmaxpbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) + ret <32 x bfloat> %res0 +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_max_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk) { +; X64-LABEL: test_int_x86_avx10_maskz_max_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmaxpbf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x5f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_max_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmaxpbf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x5f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x bfloat> @llvm.x86.avx10.vmaxpbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %res1 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.512(<32 x bfloat>, <32 x bfloat>, i32) + +define <32 x bfloat> @test_rsqrt_nepbf16_512(<32 x bfloat> %a0) { +; CHECK-LABEL: test_rsqrt_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtpbf16 %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x4e,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.512(<32 x bfloat> %a0, <32 x bfloat> zeroinitializer, i32 -1) + ret <32 x bfloat> %res +} + +declare <32 x i1> @llvm.x86.avx10.fpclass.nepbf16.512(<32 x bfloat>, i32) + +define i32 @test_int_x86_avx512_fpclass_nepbf16_512(<32 x bfloat> %x0) { +; CHECK-LABEL: test_int_x86_avx512_fpclass_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclasspbf16 $2, %zmm0, %k1 # encoding: [0x62,0xf3,0x7f,0x48,0x66,0xc8,0x02] +; CHECK-NEXT: vfpclasspbf16 $4, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x66,0xc0,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i1> @llvm.x86.avx10.fpclass.nepbf16.512(<32 x bfloat> %x0, i32 4) + %res1 = call <32 x i1> @llvm.x86.avx10.fpclass.nepbf16.512(<32 x bfloat> %x0, i32 2) + %1 = and <32 x i1> %res1, %res + %2 = bitcast <32 x i1> %1 to i32 + ret i32 %2 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.512(<32 x bfloat>, <32 x bfloat>, i32) + +define <32 x bfloat> @test_rcp_nepbf16_512(<32 x bfloat> %a0, <32 x bfloat> %a1, i32 %mask) { +; X64-LABEL: test_rcp_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrcppbf16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0x4c,0xc8] +; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_rcp_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrcppbf16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0x4c,0xc8] +; 
X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <32 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.512(<32 x bfloat> %a0, <32 x bfloat> %a1, i32 %mask) + ret <32 x bfloat> %res +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.512(<32 x bfloat>, i32, <32 x bfloat>, i32) + +define <32 x bfloat>@test_int_x86_avx512_mask_reduce_nepbf16_512(<32 x bfloat> %x0, <32 x bfloat> %x2, i32 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vreducenepbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x56,0xc8,0x08] +; X64-NEXT: vreducenepbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x56,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vreducenepbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x56,0xc8,0x08] +; X86-NEXT: vreducenepbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x56,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <32 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.512(<32 x bfloat> %x0, i32 8, <32 x bfloat> %x2, i32 %x3) + %res1 = call <32 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.512(<32 x bfloat> %x0, i32 4, <32 x bfloat> %x2, i32 -1) + %res2 = fadd <32 x bfloat> %res, %res1 + ret <32 x bfloat> %res2 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.512(<32 x bfloat>, i32, <32 x bfloat>, i32) + +define <32 x bfloat>@test_int_x86_avx512_mask_rndscale_nepbf16_512(<32 x bfloat> %x0, <32 x bfloat> %x2, i32 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrndscalenepbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x08,0xc8,0x08] +; X64-NEXT: vrndscalenepbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x08,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrndscalenepbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x08,0xc8,0x08] +; X86-NEXT: vrndscalenepbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x08,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <32 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.512(<32 x bfloat> %x0, i32 8, <32 x bfloat> %x2, i32 %x3) + %res1 = call <32 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.512(<32 x bfloat> %x0, i32 4, <32 x bfloat> %x2, i32 -1) + %res2 = fadd <32 x bfloat> %res, %res1 + ret <32 x bfloat> %res2 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.512(<32 x bfloat>, <32 x bfloat>, i32) + +define <32 x bfloat>@test_int_x86_avx512_mask_getexp_nepbf16_512(<32 x bfloat> %x0, <32 x bfloat> %x1, i32 %x2) { +; X64-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd 
%edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetexppbf16 %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x42,0xc0] +; X64-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc8] +; X64-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetexppbf16 %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x42,0xc0] +; X86-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc8] +; X86-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res1 = call <32 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.512(<32 x bfloat> %x0, <32 x bfloat> %x1, i32 %x2) + %res2 = call <32 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.512(<32 x bfloat> %x0, <32 x bfloat> zeroinitializer, i32 -1) + %res3 = fadd <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.512(<32 x bfloat>, i32, <32 x bfloat>, i32) + +define <32 x bfloat>@test_int_x86_avx512_mask_getmant_nepbf16_512(<32 x bfloat> %x0, <32 x bfloat> %x2, i32 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetmantpbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x26,0xc8,0x08] +; X64-NEXT: vgetmantpbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x26,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetmantpbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x26,0xc8,0x08] +; X86-NEXT: vgetmantpbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x26,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <32 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.512(<32 x bfloat> %x0, i32 8, <32 x bfloat> %x2, i32 %x3) + %res1 = call <32 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.512(<32 x bfloat> %x0, i32 4, <32 x bfloat> %x2, i32 -1) + %res2 = fadd <32 x bfloat> %res, %res1 + ret <32 x bfloat> %res2 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.512(<32 x bfloat>, <32 x bfloat>, <32 x bfloat>, i32) + +define <32 x bfloat>@test_int_x86_avx512_mask_scalef_nepbf16_512(<32 x bfloat> %x0, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vscalefpbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x2c,0xc1] +; X64-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd0] +; X64-NEXT: vaddnepbf16 %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf5,0x6d,0x48,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vscalefpbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x2c,0xc1] 
+; X86-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd0] +; X86-NEXT: vaddnepbf16 %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf5,0x6d,0x48,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %x3 to <32 x i1> + %res1 = call <32 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.512(<32 x bfloat> %x0, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %x3) + %res2 = call <32 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.512(<32 x bfloat> %x0, <32 x bfloat> %x1, <32 x bfloat> zeroinitializer, i32 -1) + %res3 = fadd <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll new file mode 100644 index 0000000000000..e0f5679e8ac96 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll @@ -0,0 +1,1168 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86 + +define <16 x bfloat> @test_int_x86_avx10_add_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_add_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x58,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fadd <16 x bfloat> %x1, %x2 + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_mask_add_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_add_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x58,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_add_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x58,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = fadd <16 x bfloat> %x1, %x2 + %res = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> %src + ret <16 x bfloat> %res +} +define <16 x bfloat> @test_int_x86_avx10_maskz_add_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_add_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0xc2] +; X64-NEXT: vaddnepbf16 (%rsi), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0x0e] +; X64-NEXT: vaddnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x58,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_add_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0xc2] +; X86-NEXT: vaddnepbf16 (%eax), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0x08] +; X86-NEXT: vaddnepbf16 %ymm1, 
%ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x58,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %val = load <16 x bfloat>, ptr %ptr + %res0 = fadd <16 x bfloat> %x1, %x2 + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + %t2 = fadd <16 x bfloat> %x1, %val + %res2 = select <16 x i1> %mask, <16 x bfloat> %t2, <16 x bfloat> zeroinitializer + %res3 = fadd <16 x bfloat> %res1, %res2 + ret <16 x bfloat> %res3 +} + +define <8 x bfloat> @test_int_x86_avx10_add_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_add_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x58,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fadd <8 x bfloat> %x1, %x2 + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_mask_add_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_add_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x58,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_add_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x58,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = fadd <8 x bfloat> %x1, %x2 + %res = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> %src + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_maskz_add_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_add_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0xc2] +; X64-NEXT: vaddnepbf16 (%rsi), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0x0e] +; X64-NEXT: vaddnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x58,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_add_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0xc2] +; X86-NEXT: vaddnepbf16 (%eax), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0x08] +; X86-NEXT: vaddnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x58,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %val = load <8 x bfloat>, ptr %ptr + %res0 = fadd <8 x bfloat> %x1, %x2 + %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + %t2 = fadd <8 x bfloat> %x1, %val + %res2 = select <8 x i1> %mask, <8 x bfloat> %t2, <8 x bfloat> zeroinitializer + %res3 = fadd <8 x bfloat> %res1, %res2 + ret <8 x bfloat> %res3 +} + +define <16 x bfloat> @test_int_x86_avx10_sub_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_sub_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5c,0xc1] +; CHECK-NEXT: 
ret{{[l|q]}} # encoding: [0xc3] + %res = fsub <16 x bfloat> %x1, %x2 + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_mask_sub_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_sub_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x5c,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_sub_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x5c,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = fsub <16 x bfloat> %x1, %x2 + %res = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> %src + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_maskz_sub_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2] +; X64-NEXT: vsubnepbf16 (%rsi), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x0e] +; X64-NEXT: vsubnepbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2] +; X86-NEXT: vsubnepbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08] +; X86-NEXT: vsubnepbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %val = load <16 x bfloat>, ptr %ptr + %res0 = fsub <16 x bfloat> %x1, %x2 + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + %t2 = fsub <16 x bfloat> %x1, %val + %res2 = select <16 x i1> %mask, <16 x bfloat> %t2, <16 x bfloat> zeroinitializer + %res3 = fsub <16 x bfloat> %res1, %res2 + ret <16 x bfloat> %res3 +} + +define <8 x bfloat> @test_int_x86_avx10_sub_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_sub_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fsub <8 x bfloat> %x1, %x2 + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_mask_sub_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_sub_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x5c,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_sub_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: 
[0x62,0xf5,0x75,0x09,0x5c,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = fsub <8 x bfloat> %x1, %x2 + %res = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> %src + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_maskz_sub_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2] +; X64-NEXT: vsubnepbf16 (%rsi), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x0e] +; X64-NEXT: vsubnepbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2] +; X86-NEXT: vsubnepbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08] +; X86-NEXT: vsubnepbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %val = load <8 x bfloat>, ptr %ptr + %res0 = fsub <8 x bfloat> %x1, %x2 + %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + %t2 = fsub <8 x bfloat> %x1, %val + %res2 = select <8 x i1> %mask, <8 x bfloat> %t2, <8 x bfloat> zeroinitializer + %res3 = fsub <8 x bfloat> %res1, %res2 + ret <8 x bfloat> %res3 +} + +define <16 x bfloat> @test_int_x86_avx10_mul_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_mul_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x59,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fmul <16 x bfloat> %x1, %x2 + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_mask_mul_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_mul_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x59,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_mul_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x59,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = fmul <16 x bfloat> %x1, %x2 + %res = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> %src + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_maskz_mul_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0xc2] +; X64-NEXT: vmulnepbf16 (%rsi), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0x0e] +; X64-NEXT: vmulnepbf16 %ymm1, %ymm0, 
%ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x59,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0xc2] +; X86-NEXT: vmulnepbf16 (%eax), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0x08] +; X86-NEXT: vmulnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x59,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %val = load <16 x bfloat>, ptr %ptr + %res0 = fmul <16 x bfloat> %x1, %x2 + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + %t2 = fmul <16 x bfloat> %x1, %val + %res2 = select <16 x i1> %mask, <16 x bfloat> %t2, <16 x bfloat> zeroinitializer + %res3 = fmul <16 x bfloat> %res1, %res2 + ret <16 x bfloat> %res3 +} + +define <8 x bfloat> @test_int_x86_avx10_mul_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_mul_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x59,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fmul <8 x bfloat> %x1, %x2 + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_mask_mul_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_mul_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x59,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_mul_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x59,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = fmul <8 x bfloat> %x1, %x2 + %res = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> %src + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_maskz_mul_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0xc2] +; X64-NEXT: vmulnepbf16 (%rsi), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0x0e] +; X64-NEXT: vmulnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x59,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0xc2] +; X86-NEXT: vmulnepbf16 (%eax), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0x08] +; X86-NEXT: vmulnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x59,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %val = load <8 x bfloat>, ptr %ptr + %res0 = fmul <8 x bfloat> %x1, %x2 
+ %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + %t2 = fmul <8 x bfloat> %x1, %val + %res2 = select <8 x i1> %mask, <8 x bfloat> %t2, <8 x bfloat> zeroinitializer + %res3 = fmul <8 x bfloat> %res1, %res2 + ret <8 x bfloat> %res3 +} + +define <16 x bfloat> @test_int_x86_avx10_div_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_div_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fdiv <16 x bfloat> %x1, %x2 + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_mask_div_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_div_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x5e,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_div_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x5e,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = fdiv <16 x bfloat> %x1, %x2 + %res = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> %src + ret <16 x bfloat> %res +} + +; FIXME: assembly order is different from fp16 ones +define <16 x bfloat> @test_int_x86_avx10_maskz_div_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_div_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0xc2] +; X64-NEXT: vdivnepbf16 (%rsi), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0x0e] +; X64-NEXT: vdivnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_div_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0xc2] +; X86-NEXT: vdivnepbf16 (%eax), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0x08] +; X86-NEXT: vdivnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %val = load <16 x bfloat>, ptr %ptr + %res0 = fdiv <16 x bfloat> %x1, %x2 + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + %t2 = fdiv <16 x bfloat> %x1, %val + %res2 = select <16 x i1> %mask, <16 x bfloat> %t2, <16 x bfloat> zeroinitializer + %res3 = fdiv <16 x bfloat> %res1, %res2 + ret <16 x bfloat> %res3 +} + +define <8 x bfloat> @test_int_x86_avx10_div_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_div_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fdiv <8 x bfloat> %x1, %x2 + ret <8 x bfloat> %res +} + +define <8 x bfloat> 
@test_int_x86_avx10_mask_div_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_div_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x5e,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_div_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x5e,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = fdiv <8 x bfloat> %x1, %x2 + %res = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> %src + ret <8 x bfloat> %res +} + +; FIXME: assembly order is different from fp16 ones +define <8 x bfloat> @test_int_x86_avx10_maskz_div_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_div_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0xc2] +; X64-NEXT: vdivnepbf16 (%rsi), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0x0e] +; X64-NEXT: vdivnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_div_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0xc2] +; X86-NEXT: vdivnepbf16 (%eax), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0x08] +; X86-NEXT: vdivnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %val = load <8 x bfloat>, ptr %ptr + %res0 = fdiv <8 x bfloat> %x1, %x2 + %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + %t2 = fdiv <8 x bfloat> %x1, %val + %res2 = select <8 x i1> %mask, <8 x bfloat> %t2, <8 x bfloat> zeroinitializer + %res3 = fdiv <8 x bfloat> %res1, %res2 + ret <8 x bfloat> %res3 +} + +define i16 @test_int_x86_avx10_vcmppbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpunordpbf16 %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7f,0x28,0xc2,0xc1,0x03] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fcmp uno <16 x bfloat> %x1, %x2 + %res = bitcast <16 x i1> %1 to i16 + ret i16 %res +} + +define i16 @test_int_x86_avx10_vcmppbf16256_mask2(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16256_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpeqpbf16 %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7f,0x28,0xc2,0xc1,0x00] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: andl $3, %eax # encoding: [0x83,0xe0,0x03] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 
= fcmp oeq <16 x bfloat> %x1, %x2 + %2 = and <16 x i1> %1, <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false> + %3 = bitcast <16 x i1> %2 to i16 + ret i16 %3 +} + +define i8 @test_int_x86_avx10_vcmppbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpunordpbf16 %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7f,0x08,0xc2,0xc1,0x03] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fcmp uno <8 x bfloat> %x1, %x2 + %res = bitcast <8 x i1> %1 to i8 + ret i8 %res +} + +define i8 @test_int_x86_avx10_vcmppbf16128_mask2(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16128_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpeqpbf16 %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7f,0x08,0xc2,0xc1,0x00] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: andb $3, %al # encoding: [0x24,0x03] +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fcmp oeq <8 x bfloat> %x1, %x2 + %2 = and <8 x i1> %1, <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false> + %3 = bitcast <8 x i1> %2 to i8 + ret i8 %3 +} + +define <16 x bfloat> @test_sqrt_nepbf16_256(<16 x bfloat> %a0) { +; CHECK-LABEL: test_sqrt_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtnepbf16 %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x51,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = tail call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> %a0) + ret <16 x bfloat> %1 +} + +define <16 x bfloat> @test_mm256_mask_sqrt_pbh(<16 x bfloat> %__W, i16 %__U, <16 x bfloat> %__A) { +; X64-LABEL: test_mm256_mask_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x51,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x51,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> %__A) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__W + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_maskz_sqrt_pbh(i16 %__U, <16 x bfloat>%__A) { +; X64-LABEL: test_mm256_maskz_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x51,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x51,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> %__A) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %2 +} + +define <8 x bfloat> @test_sqrt_nepbf16_128(<8 x bfloat> %a0) { +; CHECK-LABEL: test_sqrt_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtnepbf16 %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x51,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = tail call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x 
bfloat> %a0) + ret <8 x bfloat> %1 +} + +define <8 x bfloat> @test_mm_mask_sqrt_pbh(<8 x bfloat> %__W, i8 %__U, <8 x bfloat> %__A) { +; X64-LABEL: test_mm_mask_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x51,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x51,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> %__A) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__W + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_maskz_sqrt_pbh(i8 %__U, <8 x bfloat>%__A) { +; X64-LABEL: test_mm_maskz_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x51,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x51,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> %__A) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_fmaddne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; CHECK-LABEL: test_mm256_fmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) + ret <16 x bfloat> %0 +} + +define <16 x bfloat> @test_mm256_mask_fmaddne_pbh(<16 x bfloat> %__A, i16 zeroext %__U, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_mask_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__A + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_mask3_fmaddne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C, i16 zeroext %__U) { +; X64-LABEL: test_mm256_mask3_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xb8,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; 
X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask3_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xb8,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__C + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_maskz_fmaddne_pbh(i16 zeroext %__U, <16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_maskz_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xa8,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xa8,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_fmsubne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; CHECK-LABEL: test_mm256_fmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmsub213nepbf16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0xaa,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %fneg.i) + ret <16 x bfloat> %0 +} + +define <16 x bfloat> @test_mm256_mask_fmsubne_pbh(<16 x bfloat> %__A, i16 zeroext %__U, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_mask_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %fneg.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__A + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_mask3_fmsubne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C, i16 zeroext %__U) { +; X64-LABEL: test_mm256_mask3_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xba,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask3_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xba,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %fneg.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__C + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_maskz_fmsubne_pbh(i16 zeroext %__U, <16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_maskz_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xaa,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xaa,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %fneg.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_fnmaddne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; CHECK-LABEL: test_mm256_fnmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmadd213nepbf16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <16 x bfloat> %__B + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i, <16 x bfloat> %__C) + ret <16 x bfloat> %0 +} + +define <16 x bfloat> @test_mm256_mask_fnmaddne_pbh(<16 x bfloat> %__A, i16 zeroext %__U, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_mask_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__A + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_mask3_fnmaddne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C, i16 zeroext %__U) { +; X64-LABEL: test_mm256_mask3_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd231nepbf16 %ymm1, %ymm0, %ymm2 
{%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xbc,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask3_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xbc,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__C + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_maskz_fnmaddne_pbh(i16 zeroext %__U, <16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_maskz_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xac,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xac,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_fnmsubne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; CHECK-LABEL: test_mm256_fnmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmsub213nepbf16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0xae,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <16 x bfloat> %__B + %fneg1.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i, <16 x bfloat> %fneg1.i) + ret <16 x bfloat> %0 +} + +define <16 x bfloat> @test_mm256_mask_fnmsubne_pbh(<16 x bfloat> %__A, i16 zeroext %__U, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_mask_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %fneg1.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %fneg1.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__A + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_mask3_fnmsubne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x 
bfloat> %__C, i16 zeroext %__U) { +; X64-LABEL: test_mm256_mask3_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xbe,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask3_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xbe,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %fneg1.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %fneg1.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__C + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_maskz_fnmsubne_pbh(i16 zeroext %__U, <16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_maskz_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xae,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xae,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %fneg1.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %fneg1.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_fmaddne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; CHECK-LABEL: test_mm_fmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x74,0x08,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) + ret <8 x bfloat> %0 +} + +define <8 x bfloat> @test_mm_mask_fmaddne_pbh(<8 x bfloat> %__A, i8 zeroext %__U, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_mask_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__A + ret <8 x bfloat> %2 +} + +define <8 x 
bfloat> @test_mm_mask3_fmaddne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C, i8 zeroext %__U) { +; X64-LABEL: test_mm_mask3_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xb8,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask3_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xb8,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__C + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_maskz_fmaddne_pbh(i8 zeroext %__U, <8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_maskz_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xa8,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xa8,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_fmsubne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; CHECK-LABEL: test_mm_fmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmsub213nepbf16 %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x74,0x08,0xaa,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %fneg.i) + ret <8 x bfloat> %0 +} + +define <8 x bfloat> @test_mm_mask_fmsubne_pbh(<8 x bfloat> %__A, i8 zeroext %__U, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_mask_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %fneg.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__A + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_mask3_fmsubne_pbh(<8 x bfloat> %__A, <8 x bfloat> 
%__B, <8 x bfloat> %__C, i8 zeroext %__U) { +; X64-LABEL: test_mm_mask3_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xba,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask3_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xba,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %fneg.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__C + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_maskz_fmsubne_pbh(i8 zeroext %__U, <8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_maskz_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xaa,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xaa,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %fneg.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_fnmaddne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; CHECK-LABEL: test_mm_fnmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmadd213nepbf16 %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x74,0x08,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <8 x bfloat> %__B + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i, <8 x bfloat> %__C) + ret <8 x bfloat> %0 +} + +define <8 x bfloat> @test_mm_mask_fnmaddne_pbh(<8 x bfloat> %__A, i8 zeroext %__U, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_mask_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__A + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_mask3_fnmaddne_pbh(<8 x 
bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C, i8 zeroext %__U) { +; X64-LABEL: test_mm_mask3_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xbc,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask3_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xbc,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__C + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_maskz_fnmaddne_pbh(i8 zeroext %__U, <8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_maskz_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xac,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xac,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_fnmsubne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; CHECK-LABEL: test_mm_fnmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmsub213nepbf16 %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x74,0x08,0xae,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <8 x bfloat> %__B + %fneg1.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i, <8 x bfloat> %fneg1.i) + ret <8 x bfloat> %0 +} + +define <8 x bfloat> @test_mm_mask_fnmsubne_pbh(<8 x bfloat> %__A, i8 zeroext %__U, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_mask_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %fneg1.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %fneg1.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, 
<8 x bfloat> %0, <8 x bfloat> %__A + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_mask3_fnmsubne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C, i8 zeroext %__U) { +; X64-LABEL: test_mm_mask3_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xbe,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask3_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xbe,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %fneg1.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %fneg1.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__C + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_maskz_fnmsubne_pbh(i8 zeroext %__U, <8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_maskz_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xae,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xae,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %fneg1.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %fneg1.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %2 +} diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll new file mode 100644 index 0000000000000..559d866b55cc7 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll @@ -0,0 +1,602 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86 + +declare <16 x bfloat> @llvm.x86.avx10.vminpbf16256(<16 x bfloat>, <16 x bfloat>) + +define <16 x bfloat> @test_int_x86_avx10_min_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_min_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vminpbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <16 x bfloat> @llvm.x86.avx10.vminpbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2) + ret <16 x bfloat> %res0 +} + +define <16 x bfloat> @test_int_x86_avx10_maskz_min_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk) { +; 
X64-LABEL: test_int_x86_avx10_maskz_min_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vminpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_min_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vminpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = call <16 x bfloat> @llvm.x86.avx10.vminpbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2) + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %res1 +} + +declare <8 x bfloat> @llvm.x86.avx10.vminpbf16128(<8 x bfloat>, <8 x bfloat>) + +define <8 x bfloat> @test_int_x86_avx10_min_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_min_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vminpbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <8 x bfloat> @llvm.x86.avx10.vminpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2) + ret <8 x bfloat> %res0 +} + +define <8 x bfloat> @test_int_x86_avx10_maskz_min_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk) { +; X64-LABEL: test_int_x86_avx10_maskz_min_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vminpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_min_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vminpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = call <8 x bfloat> @llvm.x86.avx10.vminpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2) + %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %res1 +} + +declare <16 x bfloat> @llvm.x86.avx10.vmaxpbf16256(<16 x bfloat>, <16 x bfloat>) + +define <16 x bfloat> @test_int_x86_avx10_max_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_max_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxpbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <16 x bfloat> @llvm.x86.avx10.vmaxpbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2) + ret <16 x bfloat> %res0 +} + +define <16 x bfloat> @test_int_x86_avx10_maskz_max_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk) { +; X64-LABEL: test_int_x86_avx10_maskz_max_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmaxpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_max_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmaxpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = call <16 x bfloat> @llvm.x86.avx10.vmaxpbf16256(<16 x bfloat> %x1, <16 
x bfloat> %x2) + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %res1 +} + +declare <8 x bfloat> @llvm.x86.avx10.vmaxpbf16128(<8 x bfloat>, <8 x bfloat>) + +define <8 x bfloat> @test_int_x86_avx10_max_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_max_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxpbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <8 x bfloat> @llvm.x86.avx10.vmaxpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2) + ret <8 x bfloat> %res0 +} + +define <8 x bfloat> @test_int_x86_avx10_maskz_max_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk) { +; X64-LABEL: test_int_x86_avx10_maskz_max_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmaxpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_max_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmaxpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = call <8 x bfloat> @llvm.x86.avx10.vmaxpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2) + %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %res1 +} + +declare i32 @llvm.x86.avx10.vcomsbf16eq(<8 x bfloat>, <8 x bfloat>) +declare i32 @llvm.x86.avx10.vcomsbf16lt(<8 x bfloat>, <8 x bfloat>) +declare i32 @llvm.x86.avx10.vcomsbf16le(<8 x bfloat>, <8 x bfloat>) +declare i32 @llvm.x86.avx10.vcomsbf16gt(<8 x bfloat>, <8 x bfloat>) +declare i32 @llvm.x86.avx10.vcomsbf16ge(<8 x bfloat>, <8 x bfloat>) +declare i32 @llvm.x86.avx10.vcomsbf16neq(<8 x bfloat>, <8 x bfloat>) + +define i32 @test_x86_avx10_com_nesbf16_eq(<8 x bfloat> %a0, <8 x bfloat> %a1) { +; CHECK-LABEL: test_x86_avx10_com_nesbf16_eq: +; CHECK: # %bb.0: +; CHECK-NEXT: vcomsbf16 %xmm1, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc1] +; CHECK-NEXT: setnp %al # encoding: [0x0f,0x9b,0xc0] +; CHECK-NEXT: sete %cl # encoding: [0x0f,0x94,0xc1] +; CHECK-NEXT: andb %al, %cl # encoding: [0x20,0xc1] +; CHECK-NEXT: movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call i32 @llvm.x86.avx10.vcomsbf16eq(<8 x bfloat> %a0, <8 x bfloat> %a1) + ret i32 %res +} + +define i32 @test_x86_avx10_com_nesbf16_lt(<8 x bfloat> %a0, <8 x bfloat> %a1) { +; CHECK-LABEL: test_x86_avx10_com_nesbf16_lt: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: vcomsbf16 %xmm0, %xmm1 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc8] +; CHECK-NEXT: seta %al # encoding: [0x0f,0x97,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call i32 @llvm.x86.avx10.vcomsbf16lt(<8 x bfloat> %a0, <8 x bfloat> %a1) + ret i32 %res +} + +define i32 @test_x86_avx10_com_nesbf16_le(<8 x bfloat> %a0, <8 x bfloat> %a1) { +; CHECK-LABEL: test_x86_avx10_com_nesbf16_le: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: vcomsbf16 %xmm0, %xmm1 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc8] +; CHECK-NEXT: setae %al # encoding: [0x0f,0x93,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call i32 @llvm.x86.avx10.vcomsbf16le(<8 x bfloat> %a0, <8 x bfloat> %a1) + ret i32 %res +} + +define 
i32 @test_x86_avx10_com_nesbf16_gt(<8 x bfloat> %a0, <8 x bfloat> %a1) { +; CHECK-LABEL: test_x86_avx10_com_nesbf16_gt: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: vcomsbf16 %xmm1, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc1] +; CHECK-NEXT: setae %al # encoding: [0x0f,0x93,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call i32 @llvm.x86.avx10.vcomsbf16ge(<8 x bfloat> %a0, <8 x bfloat> %a1) + ret i32 %res +} + +define i32 @test_x86_avx10_com_nesbf16_neq(<8 x bfloat> %a0, <8 x bfloat> %a1) { +; CHECK-LABEL: test_x86_avx10_com_nesbf16_neq: +; CHECK: # %bb.0: +; CHECK-NEXT: vcomsbf16 %xmm1, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc1] +; CHECK-NEXT: setp %al # encoding: [0x0f,0x9a,0xc0] +; CHECK-NEXT: setne %cl # encoding: [0x0f,0x95,0xc1] +; CHECK-NEXT: orb %al, %cl # encoding: [0x08,0xc1] +; CHECK-NEXT: movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call i32 @llvm.x86.avx10.vcomsbf16neq(<8 x bfloat> %a0, <8 x bfloat> %a1) + ret i32 %res +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.128(<8 x bfloat>, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.256(<16 x bfloat>, <16 x bfloat>, i16) + +define <8 x bfloat> @test_rsqrt_nepbf16_128(<8 x bfloat> %a0) { +; CHECK-LABEL: test_rsqrt_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtpbf16 %xmm0, %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x4e,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.128(<8 x bfloat> %a0, <8 x bfloat> zeroinitializer, i8 -1) + ret <8 x bfloat> %res +} + +define <16 x bfloat> @test_rsqrt_nepbf16_256(<16 x bfloat> %a0) { +; CHECK-LABEL: test_rsqrt_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtpbf16 %ymm0, %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x4e,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.256(<16 x bfloat> %a0, <16 x bfloat> zeroinitializer, i16 -1) + ret <16 x bfloat> %res +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.128(<8 x bfloat>, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.256(<16 x bfloat>, <16 x bfloat>, i16) + +define <8 x bfloat> @test_rcp_nepbf16_128(<8 x bfloat> %a0, <8 x bfloat> %a1, i8 %mask) { +; X64-LABEL: test_rcp_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrcppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x4c,0xc8] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_rcp_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrcppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x4c,0xc8] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.128(<8 x bfloat> %a0, <8 x bfloat> %a1, i8 %mask) + ret <8 x bfloat> %res +} + +define <16 x bfloat> @test_rcp_nepbf16_256(<16 x bfloat> %a0, <16 x bfloat> %a1, i16 %mask) { +; X64-LABEL: test_rcp_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrcppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x4c,0xc8] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_rcp_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrcppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x4c,0xc8] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.256(<16 x bfloat> %a0, <16 x bfloat> %a1, i16 %mask) + ret <16 x bfloat> %res +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.128(<8 x bfloat>, i32, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.256(<16 x bfloat>, i32, <16 x bfloat>, i16) + +define <8 x bfloat>@test_int_x86_avx512_mask_reduce_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x2, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vreducenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x56,0xc8,0x08] +; X64-NEXT: vreducenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x56,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vreducenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x56,0xc8,0x08] +; X86-NEXT: vreducenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x56,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.128(<8 x bfloat> %x0, i32 8, <8 x bfloat> %x2, i8 %x3) + %res1 = call <8 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.128(<8 x bfloat> %x0, i32 4, <8 x bfloat> %x2, i8 -1) + %res2 = fadd <8 x bfloat> %res, %res1 + ret <8 x bfloat> %res2 +} + +define <16 x bfloat>@test_int_x86_avx512_mask_reduce_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x2, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vreducenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x56,0xc8,0x08] +; X64-NEXT: vreducenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x56,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vreducenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x56,0xc8,0x08] +; X86-NEXT: vreducenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x56,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.256(<16 x bfloat> %x0, i32 8, <16 x bfloat> %x2, i16 %x3) + %res1 = call <16 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.256(<16 x bfloat> %x0, i32 4, <16 x bfloat> %x2, i16 -1) + %res2 = fadd <16 x bfloat> %res, %res1 + ret <16 x bfloat> %res2 +} + +declare <8 x i1> 
@llvm.x86.avx10.fpclass.nepbf16.128(<8 x bfloat>, i32) +declare <16 x i1> @llvm.x86.avx10.fpclass.nepbf16.256(<16 x bfloat>, i32) + +define i8 @test_int_x86_avx512_fpclass_nepbf16_128(<8 x bfloat> %x0) { +; CHECK-LABEL: test_int_x86_avx512_fpclass_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclasspbf16 $2, %xmm0, %k1 # encoding: [0x62,0xf3,0x7f,0x08,0x66,0xc8,0x02] +; CHECK-NEXT: vfpclasspbf16 $4, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x66,0xc0,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i1> @llvm.x86.avx10.fpclass.nepbf16.128(<8 x bfloat> %x0, i32 4) + %res1 = call <8 x i1> @llvm.x86.avx10.fpclass.nepbf16.128(<8 x bfloat> %x0, i32 2) + %1 = and <8 x i1> %res1, %res + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define i16 @test_int_x86_avx512_fpclass_nepbf16_256(<16 x bfloat> %x0) { +; CHECK-LABEL: test_int_x86_avx512_fpclass_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclasspbf16 $2, %ymm0, %k1 # encoding: [0x62,0xf3,0x7f,0x28,0x66,0xc8,0x02] +; CHECK-NEXT: vfpclasspbf16 $4, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x66,0xc0,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i1> @llvm.x86.avx10.fpclass.nepbf16.256(<16 x bfloat> %x0, i32 4) + %res1 = call <16 x i1> @llvm.x86.avx10.fpclass.nepbf16.256(<16 x bfloat> %x0, i32 2) + %1 = and <16 x i1> %res1, %res + %2 = bitcast <16 x i1> %1 to i16 + ret i16 %2 +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat>, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat>, <16 x bfloat>, i16) + +define <8 x bfloat>@test_int_x86_avx512_getexp_nepbf16_128(<8 x bfloat> %x0) { +; CHECK-LABEL: test_int_x86_avx512_getexp_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vgetexppbf16 %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x42,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> zeroinitializer, i8 -1) + ret <8 x bfloat> %res +} + +define <8 x bfloat>@test_int_x86_avx512_mask_getexp_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x1, i8 %x2) { +; X64-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetexppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x42,0xc8] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetexppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x42,0xc8] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, i8 %x2) + ret <8 x bfloat> %res +} + +define <8 x bfloat>@test_int_x86_avx512_maskz_getexp_nepbf16_128(<8 x bfloat> %x0, i8 %x2) { +; X64-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: 
[0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetexppbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x42,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetexppbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x42,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> zeroinitializer, i8 %x2) + ret <8 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_getexp_nepbf16_256(<16 x bfloat> %x0) { +; CHECK-LABEL: test_int_x86_avx512_getexp_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vgetexppbf16 %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x42,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> zeroinitializer, i16 -1) + ret <16 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_mask_getexp_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1, i16 %x2) { +; X64-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetexppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x42,0xc8] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetexppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x42,0xc8] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, i16 %x2) + ret <16 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_maskz_getexp_nepbf16_256(<16 x bfloat> %x0, i16 %x2) { +; X64-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetexppbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x42,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetexppbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x42,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> zeroinitializer, i16 %x2) + ret <16 x bfloat> %res +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.128(<8 x bfloat>, i32, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.256(<16 x bfloat>, i32, <16 x bfloat>, i16) + +define <8 x bfloat>@test_int_x86_avx512_mask_getmant_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x2, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetmantpbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x26,0xc8,0x08] +; X64-NEXT: vgetmantpbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x26,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %xmm0, %xmm1, 
%xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetmantpbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x26,0xc8,0x08] +; X86-NEXT: vgetmantpbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x26,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.128(<8 x bfloat> %x0, i32 8, <8 x bfloat> %x2, i8 %x3) + %res1 = call <8 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.128(<8 x bfloat> %x0, i32 4, <8 x bfloat> %x2, i8 -1) + %res2 = fadd <8 x bfloat> %res, %res1 + ret <8 x bfloat> %res2 +} + +define <16 x bfloat>@test_int_x86_avx512_mask_getmant_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x2, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetmantpbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x26,0xc8,0x08] +; X64-NEXT: vgetmantpbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x26,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetmantpbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x26,0xc8,0x08] +; X86-NEXT: vgetmantpbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x26,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.256(<16 x bfloat> %x0, i32 8, <16 x bfloat> %x2, i16 %x3) + %res1 = call <16 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.256(<16 x bfloat> %x0, i32 4, <16 x bfloat> %x2, i16 -1) + %res2 = fadd <16 x bfloat> %res, %res1 + ret <16 x bfloat> %res2 +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.128(<8 x bfloat>, i32, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.256(<16 x bfloat>, i32, <16 x bfloat>, i16) + +define <8 x bfloat>@test_int_x86_avx512_mask_rndscale_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x2, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrndscalenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x08,0xc8,0x08] +; X64-NEXT: vrndscalenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x08,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrndscalenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x08,0xc8,0x08] +; X86-NEXT: vrndscalenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x08,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x 
bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.128(<8 x bfloat> %x0, i32 8, <8 x bfloat> %x2, i8 %x3) + %res1 = call <8 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.128(<8 x bfloat> %x0, i32 4, <8 x bfloat> %x2, i8 -1) + %res2 = fadd <8 x bfloat> %res, %res1 + ret <8 x bfloat> %res2 +} + +define <16 x bfloat>@test_int_x86_avx512_mask_rndscale_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x2, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrndscalenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x08,0xc8,0x08] +; X64-NEXT: vrndscalenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x08,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrndscalenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x08,0xc8,0x08] +; X86-NEXT: vrndscalenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x08,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.256(<16 x bfloat> %x0, i32 8, <16 x bfloat> %x2, i16 %x3) + %res1 = call <16 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.256(<16 x bfloat> %x0, i32 4, <16 x bfloat> %x2, i16 -1) + %res2 = fadd <16 x bfloat> %res, %res1 + ret <16 x bfloat> %res2 +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>, i16) + +define <8 x bfloat>@test_int_x86_avx512_scalef_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x1) { +; CHECK-LABEL: test_int_x86_avx512_scalef_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x2c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> zeroinitializer, i8 -1) + ret <8 x bfloat> %res +} + +define <8 x bfloat>@test_int_x86_avx512_mask_scalef_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x2c,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x2c,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %x3 to <8 x i1> + %res = call <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %x3) + ret <8 x bfloat> %res +} + +define <8 x bfloat>@test_int_x86_avx512_maskz_scalef_nepbf16_128(<8 x bfloat> %x0, <8 x 
bfloat> %x1, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x2c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x2c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %x3 to <8 x i1> + %res = call <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> zeroinitializer, i8 %x3) + ret <8 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_scalef_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1) { +; CHECK-LABEL: test_int_x86_avx512_scalef_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x2c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> zeroinitializer, i16 -1) + ret <16 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_mask_scalef_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x2c,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x2c,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %x3 to <16 x i1> + %res = call <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %x3) + ret <16 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_maskz_scalef_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x2c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x2c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %x3 to <16 x i1> + %res = call <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> zeroinitializer, i16 %x3) + ret <16 x bfloat> %res +} diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt new file mode 100644 index 0000000000000..8cc53db077e4f --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt @@ -0,0 +1,3015 
@@ +# RUN: llvm-mc --disassemble %s -triple=i386 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=i386 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vaddnepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vaddnepbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x58,0xd4 + +# ATT: vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vaddnepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x65,0x2f,0x58,0xd4 + +# ATT: vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vaddnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x58,0xd4 + +# ATT: vaddnepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vaddnepbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x58,0xd4 + +# ATT: vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vaddnepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x58,0xd4 + +# ATT: vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vaddnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x58,0xd4 + +# ATT: vaddnepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vaddnepbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x58,0xd4 + +# ATT: vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vaddnepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x58,0xd4 + +# ATT: vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vaddnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x58,0xd4 + +# ATT: vaddnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vaddnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x58,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vaddnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x58,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vaddnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x65,0x58,0x58,0x10 + +# ATT: vaddnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vaddnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x58,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vaddnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vaddnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x58,0x51,0x7f + +# ATT: vaddnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vaddnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x58,0x52,0x80 + +# ATT: vaddnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vaddnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x58,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vaddnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x58,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vaddnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x58,0x10 + +# ATT: vaddnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vaddnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x58,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vaddnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vaddnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x58,0x51,0x7f + +# ATT: vaddnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vaddnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x58,0x52,0x80 + +# ATT: vaddnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vaddnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x08,0x58,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 
{%k7} +# INTEL: vaddnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x58,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vaddnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x58,0x10 + +# ATT: vaddnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vaddnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x58,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vaddnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vaddnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x58,0x51,0x7f + +# ATT: vaddnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vaddnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x58,0x52,0x80 + +# ATT: vcmppbf16 $123, %ymm4, %ymm3, %k5 +# INTEL: vcmppbf16 k5, ymm3, ymm4, 123 +0x62,0xf3,0x67,0x28,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, %ymm4, %ymm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm3, ymm4, 123 +0x62,0xf3,0x67,0x2f,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, %xmm4, %xmm3, %k5 +# INTEL: vcmppbf16 k5, xmm3, xmm4, 123 +0x62,0xf3,0x67,0x08,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, %xmm4, %xmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm3, xmm4, 123 +0x62,0xf3,0x67,0x0f,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, %zmm4, %zmm3, %k5 +# INTEL: vcmppbf16 k5, zmm3, zmm4, 123 +0x62,0xf3,0x67,0x48,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, %zmm4, %zmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm3, zmm4, 123 +0x62,0xf3,0x67,0x4f,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, 268435456(%esp,%esi,8), %zmm3, %k5 +# INTEL: vcmppbf16 k5, zmm3, zmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x67,0x48,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%edi,%eax,4), %zmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x67,0x4f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%eax){1to32}, %zmm3, %k5 +# INTEL: vcmppbf16 k5, zmm3, word ptr [eax]{1to32}, 123 +0x62,0xf3,0x67,0x58,0xc2,0x28,0x7b + +# ATT: vcmppbf16 $123, -2048(,%ebp,2), %zmm3, %k5 +# INTEL: vcmppbf16 k5, zmm3, zmmword ptr [2*ebp - 2048], 123 +0x62,0xf3,0x67,0x48,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 8128(%ecx), %zmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm3, zmmword ptr [ecx + 8128], 123 +0x62,0xf3,0x67,0x4f,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%edx){1to32}, %zmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm3, word ptr [edx - 256]{1to32}, 123 +0x62,0xf3,0x67,0x5f,0xc2,0x6a,0x80,0x7b + +# ATT: vcmppbf16 $123, 268435456(%esp,%esi,8), %xmm3, %k5 +# INTEL: vcmppbf16 k5, xmm3, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x67,0x08,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%edi,%eax,4), %xmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x67,0x0f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%eax){1to8}, %xmm3, %k5 +# INTEL: vcmppbf16 k5, xmm3, word ptr [eax]{1to8}, 123 +0x62,0xf3,0x67,0x18,0xc2,0x28,0x7b + +# ATT: vcmppbf16 $123, -512(,%ebp,2), %xmm3, %k5 +# INTEL: vcmppbf16 k5, xmm3, xmmword ptr [2*ebp - 512], 123 +0x62,0xf3,0x67,0x08,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 2032(%ecx), %xmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm3, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x67,0x0f,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%edx){1to8}, %xmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm3, word ptr [edx - 256]{1to8}, 123 
+0x62,0xf3,0x67,0x1f,0xc2,0x6a,0x80,0x7b + +# ATT: vcmppbf16 $123, 268435456(%esp,%esi,8), %ymm3, %k5 +# INTEL: vcmppbf16 k5, ymm3, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x67,0x28,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%edi,%eax,4), %ymm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x67,0x2f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%eax){1to16}, %ymm3, %k5 +# INTEL: vcmppbf16 k5, ymm3, word ptr [eax]{1to16}, 123 +0x62,0xf3,0x67,0x38,0xc2,0x28,0x7b + +# ATT: vcmppbf16 $123, -1024(,%ebp,2), %ymm3, %k5 +# INTEL: vcmppbf16 k5, ymm3, ymmword ptr [2*ebp - 1024], 123 +0x62,0xf3,0x67,0x28,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 4064(%ecx), %ymm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm3, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x67,0x2f,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%edx){1to16}, %ymm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm3, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x67,0x3f,0xc2,0x6a,0x80,0x7b + +# ATT: vcomsbf16 %xmm3, %xmm2 +# INTEL: vcomsbf16 xmm2, xmm3 +0x62,0xf5,0x7d,0x08,0x2f,0xd3 + +# ATT: vcomsbf16 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcomsbf16 291(%edi,%eax,4), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcomsbf16 (%eax), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [eax] +0x62,0xf5,0x7d,0x08,0x2f,0x10 + +# ATT: vcomsbf16 -64(,%ebp,2), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [2*ebp - 64] +0x62,0xf5,0x7d,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff + +# ATT: vcomsbf16 254(%ecx), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [ecx + 254] +0x62,0xf5,0x7d,0x08,0x2f,0x51,0x7f + +# ATT: vcomsbf16 -256(%edx), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [edx - 256] +0x62,0xf5,0x7d,0x08,0x2f,0x52,0x80 + +# ATT: vdivnepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vdivnepbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x5e,0xd4 + +# ATT: vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vdivnepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x65,0x2f,0x5e,0xd4 + +# ATT: vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vdivnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x5e,0xd4 + +# ATT: vdivnepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vdivnepbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x5e,0xd4 + +# ATT: vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vdivnepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x5e,0xd4 + +# ATT: vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vdivnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x5e,0xd4 + +# ATT: vdivnepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vdivnepbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x5e,0xd4 + +# ATT: vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vdivnepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x5e,0xd4 + +# ATT: vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vdivnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x5e,0xd4 + +# ATT: vdivnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vdivnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vdivnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vdivnepbf16 zmm2, zmm3, word ptr 
[eax]{1to32} +0x62,0xf5,0x65,0x58,0x5e,0x10 + +# ATT: vdivnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vdivnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x5e,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vdivnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vdivnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x5e,0x51,0x7f + +# ATT: vdivnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vdivnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x5e,0x52,0x80 + +# ATT: vdivnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vdivnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vdivnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vdivnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x5e,0x10 + +# ATT: vdivnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vdivnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x5e,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vdivnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vdivnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x5e,0x51,0x7f + +# ATT: vdivnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vdivnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x5e,0x52,0x80 + +# ATT: vdivnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vdivnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x08,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vdivnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vdivnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x5e,0x10 + +# ATT: vdivnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vdivnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x5e,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vdivnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vdivnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x5e,0x51,0x7f + +# ATT: vdivnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vdivnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x5e,0x52,0x80 + +# ATT: vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmadd132nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmadd132nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmadd132nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmadd132nepbf16 
xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0x98,0xd4 + +# ATT: vfmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0x98,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0x98,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0x98,0x10 + +# ATT: vfmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0x98,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0x98,0x51,0x7f + +# ATT: vfmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0x98,0x52,0x80 + +# ATT: vfmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0x98,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0x98,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0x98,0x10 + +# ATT: vfmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0x98,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0x98,0x51,0x7f + +# ATT: vfmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0x98,0x52,0x80 + +# ATT: vfmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0x98,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0x98,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0x98,0x10 + +# ATT: vfmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0x98,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0x98,0x51,0x7f + +# ATT: vfmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0x98,0x52,0x80 + +# ATT: vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmadd213nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %ymm4, 
%ymm3, %ymm2 {%k7} +# INTEL: vfmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmadd213nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmadd213nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xa8,0x10 + +# ATT: vfmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xa8,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xa8,0x51,0x7f + +# ATT: vfmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xa8,0x52,0x80 + +# ATT: vfmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xa8,0x10 + +# ATT: vfmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xa8,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xa8,0x51,0x7f + +# ATT: vfmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xa8,0x52,0x80 + +# ATT: vfmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] 
+0x62,0xf6,0x64,0x0f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xa8,0x10 + +# ATT: vfmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xa8,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xa8,0x51,0x7f + +# ATT: vfmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xa8,0x52,0x80 + +# ATT: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmadd231nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmadd231nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmadd231nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xb8,0x10 + +# ATT: vfmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xb8,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xb8,0x51,0x7f + +# ATT: vfmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xb8,0x52,0x80 + +# ATT: vfmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xb8,0x10 + +# ATT: vfmadd231nepbf16 
-1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xb8,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xb8,0x51,0x7f + +# ATT: vfmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xb8,0x52,0x80 + +# ATT: vfmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xb8,0x10 + +# ATT: vfmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmadd231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xb8,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xb8,0x51,0x7f + +# ATT: vfmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xb8,0x52,0x80 + +# ATT: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmsub132nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmsub132nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmsub132nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0x9a,0x10 + +# ATT: vfmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0x9a,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: 
vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0x9a,0x51,0x7f + +# ATT: vfmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0x9a,0x52,0x80 + +# ATT: vfmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0x9a,0x10 + +# ATT: vfmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0x9a,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0x9a,0x51,0x7f + +# ATT: vfmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0x9a,0x52,0x80 + +# ATT: vfmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0x9a,0x10 + +# ATT: vfmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0x9a,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0x9a,0x51,0x7f + +# ATT: vfmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0x9a,0x52,0x80 + +# ATT: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmsub213nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmsub213nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmsub213nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm2 {k7} 
{z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xaa,0x10 + +# ATT: vfmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xaa,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xaa,0x51,0x7f + +# ATT: vfmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xaa,0x52,0x80 + +# ATT: vfmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xaa,0x10 + +# ATT: vfmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xaa,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xaa,0x51,0x7f + +# ATT: vfmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xaa,0x52,0x80 + +# ATT: vfmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xaa,0x10 + +# ATT: vfmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xaa,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xaa,0x51,0x7f + +# ATT: vfmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xaa,0x52,0x80 + +# ATT: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmsub231nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 
{%k7} {z} +# INTEL: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmsub231nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmsub231nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xba,0xd4 + +# ATT: vfmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xba,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmsub231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xba,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xba,0x10 + +# ATT: vfmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xba,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xba,0x51,0x7f + +# ATT: vfmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xba,0x52,0x80 + +# ATT: vfmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xba,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xba,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xba,0x10 + +# ATT: vfmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xba,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xba,0x51,0x7f + +# ATT: vfmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xba,0x52,0x80 + +# ATT: vfmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xba,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xba,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} 
+0x62,0xf6,0x64,0x18,0xba,0x10 + +# ATT: vfmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xba,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xba,0x51,0x7f + +# ATT: vfmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xba,0x52,0x80 + +# ATT: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmadd132nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmadd132nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmadd132nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0x9c,0x10 + +# ATT: vfnmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0x9c,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0x9c,0x51,0x7f + +# ATT: vfnmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0x9c,0x52,0x80 + +# ATT: vfnmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0x9c,0x10 + +# ATT: vfnmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] 
+0x62,0xf6,0x64,0x28,0x9c,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0x9c,0x51,0x7f + +# ATT: vfnmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0x9c,0x52,0x80 + +# ATT: vfnmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfnmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0x9c,0x10 + +# ATT: vfnmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0x9c,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0x9c,0x51,0x7f + +# ATT: vfnmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0x9c,0x52,0x80 + +# ATT: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmadd213nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmadd213nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmadd213nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xac,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xac,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xac,0x10 + +# ATT: vfnmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xac,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 
8128] +0x62,0xf6,0x64,0xcf,0xac,0x51,0x7f + +# ATT: vfnmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xac,0x52,0x80 + +# ATT: vfnmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xac,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xac,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xac,0x10 + +# ATT: vfnmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xac,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xac,0x51,0x7f + +# ATT: vfnmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xac,0x52,0x80 + +# ATT: vfnmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xac,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xac,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfnmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xac,0x10 + +# ATT: vfnmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xac,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xac,0x51,0x7f + +# ATT: vfnmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xac,0x52,0x80 + +# ATT: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmadd231nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmadd231nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmadd231nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, 
xmm4 +0x62,0xf6,0x64,0x8f,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xbc,0x10 + +# ATT: vfnmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xbc,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xbc,0x51,0x7f + +# ATT: vfnmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xbc,0x52,0x80 + +# ATT: vfnmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xbc,0x10 + +# ATT: vfnmadd231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xbc,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xbc,0x51,0x7f + +# ATT: vfnmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xbc,0x52,0x80 + +# ATT: vfnmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfnmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xbc,0x10 + +# ATT: vfnmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xbc,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xbc,0x51,0x7f + +# ATT: vfnmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xbc,0x52,0x80 + +# ATT: vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmsub132nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0x9e,0xd4 + +# ATT: 
vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmsub132nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmsub132nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0x9e,0x10 + +# ATT: vfnmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0x9e,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0x9e,0x51,0x7f + +# ATT: vfnmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0x9e,0x52,0x80 + +# ATT: vfnmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0x9e,0x10 + +# ATT: vfnmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0x9e,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0x9e,0x51,0x7f + +# ATT: vfnmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0x9e,0x52,0x80 + +# ATT: vfnmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 
+# INTEL: vfnmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0x9e,0x10 + +# ATT: vfnmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0x9e,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0x9e,0x51,0x7f + +# ATT: vfnmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0x9e,0x52,0x80 + +# ATT: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmsub213nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmsub213nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmsub213nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xae,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xae,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xae,0x10 + +# ATT: vfnmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xae,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xae,0x51,0x7f + +# ATT: vfnmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xae,0x52,0x80 + +# ATT: vfnmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xae,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xae,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xae,0x10 + +# ATT: vfnmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmsub213nepbf16 ymm2, ymm3, 
ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xae,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xae,0x51,0x7f + +# ATT: vfnmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xae,0x52,0x80 + +# ATT: vfnmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xae,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xae,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfnmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xae,0x10 + +# ATT: vfnmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xae,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xae,0x51,0x7f + +# ATT: vfnmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xae,0x52,0x80 + +# ATT: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmsub231nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmsub231nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmsub231nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xbe,0x10 + +# ATT: vfnmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xbe,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm2 {k7} {z}, 
zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xbe,0x51,0x7f + +# ATT: vfnmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xbe,0x52,0x80 + +# ATT: vfnmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xbe,0x10 + +# ATT: vfnmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xbe,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xbe,0x51,0x7f + +# ATT: vfnmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xbe,0x52,0x80 + +# ATT: vfnmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfnmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xbe,0x10 + +# ATT: vfnmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xbe,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xbe,0x51,0x7f + +# ATT: vfnmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xbe,0x52,0x80 + +# ATT: vfpclasspbf16 $123, %zmm3, %k5 +# INTEL: vfpclasspbf16 k5, zmm3, 123 +0x62,0xf3,0x7f,0x48,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16 $123, %zmm3, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, zmm3, 123 +0x62,0xf3,0x7f,0x4f,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16 $123, %ymm3, %k5 +# INTEL: vfpclasspbf16 k5, ymm3, 123 +0x62,0xf3,0x7f,0x28,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16 $123, %ymm3, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, ymm3, 123 +0x62,0xf3,0x7f,0x2f,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16 $123, %xmm3, %k5 +# INTEL: vfpclasspbf16 k5, xmm3, 123 +0x62,0xf3,0x7f,0x08,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16 $123, %xmm3, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmm3, 123 +0x62,0xf3,0x7f,0x0f,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16x $123, 268435456(%esp,%esi,8), %k5 +# INTEL: vfpclasspbf16 k5, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x08,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vfpclasspbf16x $123, 291(%edi,%eax,4), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x0f,0x66,0xac,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vfpclasspbf16 $123, 
(%eax){1to8}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [eax]{1to8}, 123 +0x62,0xf3,0x7f,0x18,0x66,0x28,0x7b + +# ATT: vfpclasspbf16x $123, -512(,%ebp,2), %k5 +# INTEL: vfpclasspbf16 k5, xmmword ptr [2*ebp - 512], 123 +0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vfpclasspbf16x $123, 2032(%ecx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%edx){1to8}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b + +# ATT: vfpclasspbf16 $123, (%eax){1to16}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [eax]{1to16}, 123 +0x62,0xf3,0x7f,0x38,0x66,0x28,0x7b + +# ATT: vfpclasspbf16y $123, -1024(,%ebp,2), %k5 +# INTEL: vfpclasspbf16 k5, ymmword ptr [2*ebp - 1024], 123 +0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vfpclasspbf16y $123, 4064(%ecx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%edx){1to16}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b + +# ATT: vfpclasspbf16 $123, (%eax){1to32}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [eax]{1to32}, 123 +0x62,0xf3,0x7f,0x58,0x66,0x28,0x7b + +# ATT: vfpclasspbf16z $123, -2048(,%ebp,2), %k5 +# INTEL: vfpclasspbf16 k5, zmmword ptr [2*ebp - 2048], 123 +0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vfpclasspbf16z $123, 8128(%ecx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, zmmword ptr [ecx + 8128], 123 +0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%edx){1to32}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to32}, 123 +0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b + +# ATT: vgetexppbf16 %xmm3, %xmm2 +# INTEL: vgetexppbf16 xmm2, xmm3 +0x62,0xf5,0x7d,0x08,0x42,0xd3 + +# ATT: vgetexppbf16 %xmm3, %xmm2 {%k7} +# INTEL: vgetexppbf16 xmm2 {k7}, xmm3 +0x62,0xf5,0x7d,0x0f,0x42,0xd3 + +# ATT: vgetexppbf16 %xmm3, %xmm2 {%k7} {z} +# INTEL: vgetexppbf16 xmm2 {k7} {z}, xmm3 +0x62,0xf5,0x7d,0x8f,0x42,0xd3 + +# ATT: vgetexppbf16 %zmm3, %zmm2 +# INTEL: vgetexppbf16 zmm2, zmm3 +0x62,0xf5,0x7d,0x48,0x42,0xd3 + +# ATT: vgetexppbf16 %zmm3, %zmm2 {%k7} +# INTEL: vgetexppbf16 zmm2 {k7}, zmm3 +0x62,0xf5,0x7d,0x4f,0x42,0xd3 + +# ATT: vgetexppbf16 %zmm3, %zmm2 {%k7} {z} +# INTEL: vgetexppbf16 zmm2 {k7} {z}, zmm3 +0x62,0xf5,0x7d,0xcf,0x42,0xd3 + +# ATT: vgetexppbf16 %ymm3, %ymm2 +# INTEL: vgetexppbf16 ymm2, ymm3 +0x62,0xf5,0x7d,0x28,0x42,0xd3 + +# ATT: vgetexppbf16 %ymm3, %ymm2 {%k7} +# INTEL: vgetexppbf16 ymm2 {k7}, ymm3 +0x62,0xf5,0x7d,0x2f,0x42,0xd3 + +# ATT: vgetexppbf16 %ymm3, %ymm2 {%k7} {z} +# INTEL: vgetexppbf16 ymm2 {k7} {z}, ymm3 +0x62,0xf5,0x7d,0xaf,0x42,0xd3 + +# ATT: vgetexppbf16 268435456(%esp,%esi,8), %xmm2 +# INTEL: vgetexppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vgetexppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%eax){1to8}, %xmm2 +# INTEL: vgetexppbf16 xmm2, word ptr [eax]{1to8} +0x62,0xf5,0x7d,0x18,0x42,0x10 + +# ATT: vgetexppbf16 -512(,%ebp,2), %xmm2 +# INTEL: vgetexppbf16 xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x7d,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vgetexppbf16 2032(%ecx), %xmm2 {%k7} {z} +# 
INTEL: vgetexppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7d,0x8f,0x42,0x51,0x7f + +# ATT: vgetexppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vgetexppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7d,0x9f,0x42,0x52,0x80 + +# ATT: vgetexppbf16 268435456(%esp,%esi,8), %ymm2 +# INTEL: vgetexppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vgetexppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%eax){1to16}, %ymm2 +# INTEL: vgetexppbf16 ymm2, word ptr [eax]{1to16} +0x62,0xf5,0x7d,0x38,0x42,0x10 + +# ATT: vgetexppbf16 -1024(,%ebp,2), %ymm2 +# INTEL: vgetexppbf16 ymm2, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x7d,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vgetexppbf16 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vgetexppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7d,0xaf,0x42,0x51,0x7f + +# ATT: vgetexppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vgetexppbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7d,0xbf,0x42,0x52,0x80 + +# ATT: vgetexppbf16 268435456(%esp,%esi,8), %zmm2 +# INTEL: vgetexppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vgetexppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%eax){1to32}, %zmm2 +# INTEL: vgetexppbf16 zmm2, word ptr [eax]{1to32} +0x62,0xf5,0x7d,0x58,0x42,0x10 + +# ATT: vgetexppbf16 -2048(,%ebp,2), %zmm2 +# INTEL: vgetexppbf16 zmm2, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x7d,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vgetexppbf16 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vgetexppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf5,0x7d,0xcf,0x42,0x51,0x7f + +# ATT: vgetexppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vgetexppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf5,0x7d,0xdf,0x42,0x52,0x80 + +# ATT: vgetmantpbf16 $123, %zmm3, %zmm2 +# INTEL: vgetmantpbf16 zmm2, zmm3, 123 +0x62,0xf3,0x7f,0x48,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} +# INTEL: vgetmantpbf16 zmm2 {k7}, zmm3, 123 +0x62,0xf3,0x7f,0x4f,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} {z} +# INTEL: vgetmantpbf16 zmm2 {k7} {z}, zmm3, 123 +0x62,0xf3,0x7f,0xcf,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %ymm3, %ymm2 +# INTEL: vgetmantpbf16 ymm2, ymm3, 123 +0x62,0xf3,0x7f,0x28,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} +# INTEL: vgetmantpbf16 ymm2 {k7}, ymm3, 123 +0x62,0xf3,0x7f,0x2f,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} {z} +# INTEL: vgetmantpbf16 ymm2 {k7} {z}, ymm3, 123 +0x62,0xf3,0x7f,0xaf,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %xmm3, %xmm2 +# INTEL: vgetmantpbf16 xmm2, xmm3, 123 +0x62,0xf3,0x7f,0x08,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} +# INTEL: vgetmantpbf16 xmm2 {k7}, xmm3, 123 +0x62,0xf3,0x7f,0x0f,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} {z} +# INTEL: vgetmantpbf16 xmm2 {k7} {z}, xmm3, 123 +0x62,0xf3,0x7f,0x8f,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%esp,%esi,8), %xmm2 +# INTEL: vgetmantpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x08,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%edi,%eax,4), %xmm2 
{%k7} +# INTEL: vgetmantpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x0f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%eax){1to8}, %xmm2 +# INTEL: vgetmantpbf16 xmm2, word ptr [eax]{1to8}, 123 +0x62,0xf3,0x7f,0x18,0x26,0x10,0x7b + +# ATT: vgetmantpbf16 $123, -512(,%ebp,2), %xmm2 +# INTEL: vgetmantpbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +0x62,0xf3,0x7f,0x08,0x26,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vgetmantpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7f,0x8f,0x26,0x51,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vgetmantpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7f,0x9f,0x26,0x52,0x80,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%esp,%esi,8), %ymm2 +# INTEL: vgetmantpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x28,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vgetmantpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x2f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%eax){1to16}, %ymm2 +# INTEL: vgetmantpbf16 ymm2, word ptr [eax]{1to16}, 123 +0x62,0xf3,0x7f,0x38,0x26,0x10,0x7b + +# ATT: vgetmantpbf16 $123, -1024(,%ebp,2), %ymm2 +# INTEL: vgetmantpbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 +0x62,0xf3,0x7f,0x28,0x26,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vgetmantpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7f,0xaf,0x26,0x51,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vgetmantpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7f,0xbf,0x26,0x52,0x80,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%esp,%esi,8), %zmm2 +# INTEL: vgetmantpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x48,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vgetmantpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x4f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%eax){1to32}, %zmm2 +# INTEL: vgetmantpbf16 zmm2, word ptr [eax]{1to32}, 123 +0x62,0xf3,0x7f,0x58,0x26,0x10,0x7b + +# ATT: vgetmantpbf16 $123, -2048(,%ebp,2), %zmm2 +# INTEL: vgetmantpbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +0x62,0xf3,0x7f,0x48,0x26,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vgetmantpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +0x62,0xf3,0x7f,0xcf,0x26,0x51,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vgetmantpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +0x62,0xf3,0x7f,0xdf,0x26,0x52,0x80,0x7b + +# ATT: vmaxpbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vmaxpbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x5f,0xd4 + +# ATT: vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vmaxpbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x65,0x2f,0x5f,0xd4 + +# ATT: vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vmaxpbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x5f,0xd4 + +# ATT: vmaxpbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vmaxpbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x5f,0xd4 + +# ATT: vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vmaxpbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x5f,0xd4 + +# ATT: vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vmaxpbf16 
zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x5f,0xd4 + +# ATT: vmaxpbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vmaxpbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x5f,0xd4 + +# ATT: vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vmaxpbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x5f,0xd4 + +# ATT: vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vmaxpbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x5f,0xd4 + +# ATT: vmaxpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vmaxpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vmaxpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vmaxpbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x65,0x58,0x5f,0x10 + +# ATT: vmaxpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vmaxpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x5f,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vmaxpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vmaxpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x5f,0x51,0x7f + +# ATT: vmaxpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vmaxpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x5f,0x52,0x80 + +# ATT: vmaxpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vmaxpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vmaxpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vmaxpbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x5f,0x10 + +# ATT: vmaxpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vmaxpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x5f,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vmaxpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vmaxpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x5f,0x51,0x7f + +# ATT: vmaxpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vmaxpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x5f,0x52,0x80 + +# ATT: vmaxpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vmaxpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x08,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vmaxpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vmaxpbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x5f,0x10 + +# ATT: vmaxpbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vmaxpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x5f,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vmaxpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vmaxpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x5f,0x51,0x7f + +# ATT: vmaxpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vmaxpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x5f,0x52,0x80 + +# ATT: vminpbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vminpbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x5d,0xd4 + +# ATT: vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vminpbf16 ymm2 {k7}, ymm3, 
ymm4 +0x62,0xf5,0x65,0x2f,0x5d,0xd4 + +# ATT: vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vminpbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x5d,0xd4 + +# ATT: vminpbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vminpbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x5d,0xd4 + +# ATT: vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vminpbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x5d,0xd4 + +# ATT: vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vminpbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x5d,0xd4 + +# ATT: vminpbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vminpbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x5d,0xd4 + +# ATT: vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vminpbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x5d,0xd4 + +# ATT: vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vminpbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x5d,0xd4 + +# ATT: vminpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vminpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vminpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vminpbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x65,0x58,0x5d,0x10 + +# ATT: vminpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vminpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x5d,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vminpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vminpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x5d,0x51,0x7f + +# ATT: vminpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vminpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x5d,0x52,0x80 + +# ATT: vminpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vminpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vminpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vminpbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x5d,0x10 + +# ATT: vminpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vminpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x5d,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vminpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vminpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x5d,0x51,0x7f + +# ATT: vminpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vminpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x5d,0x52,0x80 + +# ATT: vminpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vminpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x08,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vminpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vminpbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x5d,0x10 + +# ATT: vminpbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vminpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x5d,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vminpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} 
{z} +# INTEL: vminpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x5d,0x51,0x7f + +# ATT: vminpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vminpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x5d,0x52,0x80 + +# ATT: vmulnepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vmulnepbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x59,0xd4 + +# ATT: vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vmulnepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x65,0x2f,0x59,0xd4 + +# ATT: vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vmulnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x59,0xd4 + +# ATT: vmulnepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vmulnepbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x59,0xd4 + +# ATT: vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vmulnepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x59,0xd4 + +# ATT: vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vmulnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x59,0xd4 + +# ATT: vmulnepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vmulnepbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x59,0xd4 + +# ATT: vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vmulnepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x59,0xd4 + +# ATT: vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vmulnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x59,0xd4 + +# ATT: vmulnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vmulnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x59,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vmulnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x59,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vmulnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x65,0x58,0x59,0x10 + +# ATT: vmulnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vmulnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x59,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vmulnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vmulnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x59,0x51,0x7f + +# ATT: vmulnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vmulnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x59,0x52,0x80 + +# ATT: vmulnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vmulnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x59,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vmulnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x59,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vmulnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x59,0x10 + +# ATT: vmulnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vmulnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x59,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vmulnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vmulnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x59,0x51,0x7f + +# ATT: vmulnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vmulnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x59,0x52,0x80 + +# ATT: vmulnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vmulnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] 
+0x62,0xf5,0x65,0x08,0x59,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vmulnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x59,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vmulnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x59,0x10 + +# ATT: vmulnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vmulnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x59,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vmulnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vmulnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x59,0x51,0x7f + +# ATT: vmulnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vmulnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x59,0x52,0x80 + +# ATT: vrcppbf16 %xmm3, %xmm2 +# INTEL: vrcppbf16 xmm2, xmm3 +0x62,0xf6,0x7c,0x08,0x4c,0xd3 + +# ATT: vrcppbf16 %xmm3, %xmm2 {%k7} +# INTEL: vrcppbf16 xmm2 {k7}, xmm3 +0x62,0xf6,0x7c,0x0f,0x4c,0xd3 + +# ATT: vrcppbf16 %xmm3, %xmm2 {%k7} {z} +# INTEL: vrcppbf16 xmm2 {k7} {z}, xmm3 +0x62,0xf6,0x7c,0x8f,0x4c,0xd3 + +# ATT: vrcppbf16 %zmm3, %zmm2 +# INTEL: vrcppbf16 zmm2, zmm3 +0x62,0xf6,0x7c,0x48,0x4c,0xd3 + +# ATT: vrcppbf16 %zmm3, %zmm2 {%k7} +# INTEL: vrcppbf16 zmm2 {k7}, zmm3 +0x62,0xf6,0x7c,0x4f,0x4c,0xd3 + +# ATT: vrcppbf16 %zmm3, %zmm2 {%k7} {z} +# INTEL: vrcppbf16 zmm2 {k7} {z}, zmm3 +0x62,0xf6,0x7c,0xcf,0x4c,0xd3 + +# ATT: vrcppbf16 %ymm3, %ymm2 +# INTEL: vrcppbf16 ymm2, ymm3 +0x62,0xf6,0x7c,0x28,0x4c,0xd3 + +# ATT: vrcppbf16 %ymm3, %ymm2 {%k7} +# INTEL: vrcppbf16 ymm2 {k7}, ymm3 +0x62,0xf6,0x7c,0x2f,0x4c,0xd3 + +# ATT: vrcppbf16 %ymm3, %ymm2 {%k7} {z} +# INTEL: vrcppbf16 ymm2 {k7} {z}, ymm3 +0x62,0xf6,0x7c,0xaf,0x4c,0xd3 + +# ATT: vrcppbf16 268435456(%esp,%esi,8), %xmm2 +# INTEL: vrcppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x08,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vrcppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x0f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%eax){1to8}, %xmm2 +# INTEL: vrcppbf16 xmm2, word ptr [eax]{1to8} +0x62,0xf6,0x7c,0x18,0x4c,0x10 + +# ATT: vrcppbf16 -512(,%ebp,2), %xmm2 +# INTEL: vrcppbf16 xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x7c,0x08,0x4c,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vrcppbf16 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vrcppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf6,0x7c,0x8f,0x4c,0x51,0x7f + +# ATT: vrcppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vrcppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf6,0x7c,0x9f,0x4c,0x52,0x80 + +# ATT: vrcppbf16 268435456(%esp,%esi,8), %ymm2 +# INTEL: vrcppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x28,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vrcppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x2f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%eax){1to16}, %ymm2 +# INTEL: vrcppbf16 ymm2, word ptr [eax]{1to16} +0x62,0xf6,0x7c,0x38,0x4c,0x10 + +# ATT: vrcppbf16 -1024(,%ebp,2), %ymm2 +# INTEL: vrcppbf16 ymm2, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x7c,0x28,0x4c,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vrcppbf16 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vrcppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf6,0x7c,0xaf,0x4c,0x51,0x7f + +# ATT: vrcppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vrcppbf16 ymm2 
{k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf6,0x7c,0xbf,0x4c,0x52,0x80 + +# ATT: vrcppbf16 268435456(%esp,%esi,8), %zmm2 +# INTEL: vrcppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x48,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vrcppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x4f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%eax){1to32}, %zmm2 +# INTEL: vrcppbf16 zmm2, word ptr [eax]{1to32} +0x62,0xf6,0x7c,0x58,0x4c,0x10 + +# ATT: vrcppbf16 -2048(,%ebp,2), %zmm2 +# INTEL: vrcppbf16 zmm2, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x7c,0x48,0x4c,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vrcppbf16 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vrcppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf6,0x7c,0xcf,0x4c,0x51,0x7f + +# ATT: vrcppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vrcppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf6,0x7c,0xdf,0x4c,0x52,0x80 + +# ATT: vreducenepbf16 $123, %zmm3, %zmm2 +# INTEL: vreducenepbf16 zmm2, zmm3, 123 +0x62,0xf3,0x7f,0x48,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %zmm3, %zmm2 {%k7} +# INTEL: vreducenepbf16 zmm2 {k7}, zmm3, 123 +0x62,0xf3,0x7f,0x4f,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %zmm3, %zmm2 {%k7} {z} +# INTEL: vreducenepbf16 zmm2 {k7} {z}, zmm3, 123 +0x62,0xf3,0x7f,0xcf,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %ymm3, %ymm2 +# INTEL: vreducenepbf16 ymm2, ymm3, 123 +0x62,0xf3,0x7f,0x28,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %ymm3, %ymm2 {%k7} +# INTEL: vreducenepbf16 ymm2 {k7}, ymm3, 123 +0x62,0xf3,0x7f,0x2f,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %ymm3, %ymm2 {%k7} {z} +# INTEL: vreducenepbf16 ymm2 {k7} {z}, ymm3, 123 +0x62,0xf3,0x7f,0xaf,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %xmm3, %xmm2 +# INTEL: vreducenepbf16 xmm2, xmm3, 123 +0x62,0xf3,0x7f,0x08,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %xmm3, %xmm2 {%k7} +# INTEL: vreducenepbf16 xmm2 {k7}, xmm3, 123 +0x62,0xf3,0x7f,0x0f,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %xmm3, %xmm2 {%k7} {z} +# INTEL: vreducenepbf16 xmm2 {k7} {z}, xmm3, 123 +0x62,0xf3,0x7f,0x8f,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 +# INTEL: vreducenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x08,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vreducenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x0f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%eax){1to8}, %xmm2 +# INTEL: vreducenepbf16 xmm2, word ptr [eax]{1to8}, 123 +0x62,0xf3,0x7f,0x18,0x56,0x10,0x7b + +# ATT: vreducenepbf16 $123, -512(,%ebp,2), %xmm2 +# INTEL: vreducenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +0x62,0xf3,0x7f,0x08,0x56,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vreducenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7f,0x8f,0x56,0x51,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vreducenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7f,0x9f,0x56,0x52,0x80,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 +# INTEL: vreducenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x28,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vreducenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 
+0x62,0xf3,0x7f,0x2f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%eax){1to16}, %ymm2 +# INTEL: vreducenepbf16 ymm2, word ptr [eax]{1to16}, 123 +0x62,0xf3,0x7f,0x38,0x56,0x10,0x7b + +# ATT: vreducenepbf16 $123, -1024(,%ebp,2), %ymm2 +# INTEL: vreducenepbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 +0x62,0xf3,0x7f,0x28,0x56,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vreducenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7f,0xaf,0x56,0x51,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vreducenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7f,0xbf,0x56,0x52,0x80,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 +# INTEL: vreducenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x48,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vreducenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x4f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%eax){1to32}, %zmm2 +# INTEL: vreducenepbf16 zmm2, word ptr [eax]{1to32}, 123 +0x62,0xf3,0x7f,0x58,0x56,0x10,0x7b + +# ATT: vreducenepbf16 $123, -2048(,%ebp,2), %zmm2 +# INTEL: vreducenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +0x62,0xf3,0x7f,0x48,0x56,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vreducenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +0x62,0xf3,0x7f,0xcf,0x56,0x51,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vreducenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +0x62,0xf3,0x7f,0xdf,0x56,0x52,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm3, %zmm2 +# INTEL: vrndscalenepbf16 zmm2, zmm3, 123 +0x62,0xf3,0x7f,0x48,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} +# INTEL: vrndscalenepbf16 zmm2 {k7}, zmm3, 123 +0x62,0xf3,0x7f,0x4f,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm2 {k7} {z}, zmm3, 123 +0x62,0xf3,0x7f,0xcf,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm3, %ymm2 +# INTEL: vrndscalenepbf16 ymm2, ymm3, 123 +0x62,0xf3,0x7f,0x28,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} +# INTEL: vrndscalenepbf16 ymm2 {k7}, ymm3, 123 +0x62,0xf3,0x7f,0x2f,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm2 {k7} {z}, ymm3, 123 +0x62,0xf3,0x7f,0xaf,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm3, %xmm2 +# INTEL: vrndscalenepbf16 xmm2, xmm3, 123 +0x62,0xf3,0x7f,0x08,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} +# INTEL: vrndscalenepbf16 xmm2 {k7}, xmm3, 123 +0x62,0xf3,0x7f,0x0f,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm2 {k7} {z}, xmm3, 123 +0x62,0xf3,0x7f,0x8f,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 +# INTEL: vrndscalenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x08,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vrndscalenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x0f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%eax){1to8}, %xmm2 +# INTEL: vrndscalenepbf16 xmm2, word ptr [eax]{1to8}, 123 
+0x62,0xf3,0x7f,0x18,0x08,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, -512(,%ebp,2), %xmm2 +# INTEL: vrndscalenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +0x62,0xf3,0x7f,0x08,0x08,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7f,0x8f,0x08,0x51,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7f,0x9f,0x08,0x52,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 +# INTEL: vrndscalenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x28,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vrndscalenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x2f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%eax){1to16}, %ymm2 +# INTEL: vrndscalenepbf16 ymm2, word ptr [eax]{1to16}, 123 +0x62,0xf3,0x7f,0x38,0x08,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, -1024(,%ebp,2), %ymm2 +# INTEL: vrndscalenepbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 +0x62,0xf3,0x7f,0x28,0x08,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7f,0xaf,0x08,0x51,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7f,0xbf,0x08,0x52,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 +# INTEL: vrndscalenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x48,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vrndscalenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x4f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%eax){1to32}, %zmm2 +# INTEL: vrndscalenepbf16 zmm2, word ptr [eax]{1to32}, 123 +0x62,0xf3,0x7f,0x58,0x08,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, -2048(,%ebp,2), %zmm2 +# INTEL: vrndscalenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +0x62,0xf3,0x7f,0x48,0x08,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +0x62,0xf3,0x7f,0xcf,0x08,0x51,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +0x62,0xf3,0x7f,0xdf,0x08,0x52,0x80,0x7b + +# ATT: vrsqrtpbf16 %xmm3, %xmm2 +# INTEL: vrsqrtpbf16 xmm2, xmm3 +0x62,0xf6,0x7c,0x08,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %xmm3, %xmm2 {%k7} +# INTEL: vrsqrtpbf16 xmm2 {k7}, xmm3 +0x62,0xf6,0x7c,0x0f,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %xmm3, %xmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm2 {k7} {z}, xmm3 +0x62,0xf6,0x7c,0x8f,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %zmm3, %zmm2 +# INTEL: vrsqrtpbf16 zmm2, zmm3 +0x62,0xf6,0x7c,0x48,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %zmm3, %zmm2 {%k7} +# INTEL: vrsqrtpbf16 zmm2 {k7}, zmm3 +0x62,0xf6,0x7c,0x4f,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %zmm3, %zmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm2 {k7} {z}, zmm3 +0x62,0xf6,0x7c,0xcf,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %ymm3, %ymm2 +# INTEL: vrsqrtpbf16 ymm2, ymm3 +0x62,0xf6,0x7c,0x28,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %ymm3, %ymm2 {%k7} +# INTEL: 
vrsqrtpbf16 ymm2 {k7}, ymm3 +0x62,0xf6,0x7c,0x2f,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %ymm3, %ymm2 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm2 {k7} {z}, ymm3 +0x62,0xf6,0x7c,0xaf,0x4e,0xd3 + +# ATT: vrsqrtpbf16 268435456(%esp,%esi,8), %xmm2 +# INTEL: vrsqrtpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x08,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vrsqrtpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x0f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%eax){1to8}, %xmm2 +# INTEL: vrsqrtpbf16 xmm2, word ptr [eax]{1to8} +0x62,0xf6,0x7c,0x18,0x4e,0x10 + +# ATT: vrsqrtpbf16 -512(,%ebp,2), %xmm2 +# INTEL: vrsqrtpbf16 xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x7c,0x08,0x4e,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vrsqrtpbf16 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf6,0x7c,0x8f,0x4e,0x51,0x7f + +# ATT: vrsqrtpbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf6,0x7c,0x9f,0x4e,0x52,0x80 + +# ATT: vrsqrtpbf16 268435456(%esp,%esi,8), %ymm2 +# INTEL: vrsqrtpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x28,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vrsqrtpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x2f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%eax){1to16}, %ymm2 +# INTEL: vrsqrtpbf16 ymm2, word ptr [eax]{1to16} +0x62,0xf6,0x7c,0x38,0x4e,0x10 + +# ATT: vrsqrtpbf16 -1024(,%ebp,2), %ymm2 +# INTEL: vrsqrtpbf16 ymm2, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x7c,0x28,0x4e,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vrsqrtpbf16 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf6,0x7c,0xaf,0x4e,0x51,0x7f + +# ATT: vrsqrtpbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf6,0x7c,0xbf,0x4e,0x52,0x80 + +# ATT: vrsqrtpbf16 268435456(%esp,%esi,8), %zmm2 +# INTEL: vrsqrtpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x48,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vrsqrtpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x4f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%eax){1to32}, %zmm2 +# INTEL: vrsqrtpbf16 zmm2, word ptr [eax]{1to32} +0x62,0xf6,0x7c,0x58,0x4e,0x10 + +# ATT: vrsqrtpbf16 -2048(,%ebp,2), %zmm2 +# INTEL: vrsqrtpbf16 zmm2, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x7c,0x48,0x4e,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vrsqrtpbf16 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf6,0x7c,0xcf,0x4e,0x51,0x7f + +# ATT: vrsqrtpbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf6,0x7c,0xdf,0x4e,0x52,0x80 + +# ATT: vscalefpbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vscalefpbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0x2c,0xd4 + +# ATT: vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vscalefpbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0x2c,0xd4 + +# ATT: vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vscalefpbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0x2c,0xd4 + +# ATT: vscalefpbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vscalefpbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0x2c,0xd4 + +# ATT: vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vscalefpbf16 zmm2 {k7}, zmm3, zmm4 
+0x62,0xf6,0x64,0x4f,0x2c,0xd4 + +# ATT: vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vscalefpbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0x2c,0xd4 + +# ATT: vscalefpbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vscalefpbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0x2c,0xd4 + +# ATT: vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vscalefpbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0x2c,0xd4 + +# ATT: vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vscalefpbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0x2c,0xd4 + +# ATT: vscalefpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vscalefpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vscalefpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vscalefpbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0x2c,0x10 + +# ATT: vscalefpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vscalefpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0x2c,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vscalefpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vscalefpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0x2c,0x51,0x7f + +# ATT: vscalefpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vscalefpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0x2c,0x52,0x80 + +# ATT: vscalefpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vscalefpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vscalefpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vscalefpbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0x2c,0x10 + +# ATT: vscalefpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vscalefpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0x2c,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vscalefpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vscalefpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0x2c,0x51,0x7f + +# ATT: vscalefpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vscalefpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0x2c,0x52,0x80 + +# ATT: vscalefpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vscalefpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vscalefpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vscalefpbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0x2c,0x10 + +# ATT: vscalefpbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vscalefpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0x2c,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vscalefpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vscalefpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0x2c,0x51,0x7f + +# ATT: vscalefpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vscalefpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 
256]{1to8} +0x62,0xf6,0x64,0x9f,0x2c,0x52,0x80 + +# ATT: vsqrtnepbf16 %xmm3, %xmm2 +# INTEL: vsqrtnepbf16 xmm2, xmm3 +0x62,0xf5,0x7d,0x08,0x51,0xd3 + +# ATT: vsqrtnepbf16 %xmm3, %xmm2 {%k7} +# INTEL: vsqrtnepbf16 xmm2 {k7}, xmm3 +0x62,0xf5,0x7d,0x0f,0x51,0xd3 + +# ATT: vsqrtnepbf16 %xmm3, %xmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm2 {k7} {z}, xmm3 +0x62,0xf5,0x7d,0x8f,0x51,0xd3 + +# ATT: vsqrtnepbf16 %zmm3, %zmm2 +# INTEL: vsqrtnepbf16 zmm2, zmm3 +0x62,0xf5,0x7d,0x48,0x51,0xd3 + +# ATT: vsqrtnepbf16 %zmm3, %zmm2 {%k7} +# INTEL: vsqrtnepbf16 zmm2 {k7}, zmm3 +0x62,0xf5,0x7d,0x4f,0x51,0xd3 + +# ATT: vsqrtnepbf16 %zmm3, %zmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm2 {k7} {z}, zmm3 +0x62,0xf5,0x7d,0xcf,0x51,0xd3 + +# ATT: vsqrtnepbf16 %ymm3, %ymm2 +# INTEL: vsqrtnepbf16 ymm2, ymm3 +0x62,0xf5,0x7d,0x28,0x51,0xd3 + +# ATT: vsqrtnepbf16 %ymm3, %ymm2 {%k7} +# INTEL: vsqrtnepbf16 ymm2 {k7}, ymm3 +0x62,0xf5,0x7d,0x2f,0x51,0xd3 + +# ATT: vsqrtnepbf16 %ymm3, %ymm2 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm2 {k7} {z}, ymm3 +0x62,0xf5,0x7d,0xaf,0x51,0xd3 + +# ATT: vsqrtnepbf16 268435456(%esp,%esi,8), %xmm2 +# INTEL: vsqrtnepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x08,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vsqrtnepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%eax){1to8}, %xmm2 +# INTEL: vsqrtnepbf16 xmm2, word ptr [eax]{1to8} +0x62,0xf5,0x7d,0x18,0x51,0x10 + +# ATT: vsqrtnepbf16 -512(,%ebp,2), %xmm2 +# INTEL: vsqrtnepbf16 xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x7d,0x08,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vsqrtnepbf16 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7d,0x8f,0x51,0x51,0x7f + +# ATT: vsqrtnepbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7d,0x9f,0x51,0x52,0x80 + +# ATT: vsqrtnepbf16 268435456(%esp,%esi,8), %ymm2 +# INTEL: vsqrtnepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x28,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vsqrtnepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%eax){1to16}, %ymm2 +# INTEL: vsqrtnepbf16 ymm2, word ptr [eax]{1to16} +0x62,0xf5,0x7d,0x38,0x51,0x10 + +# ATT: vsqrtnepbf16 -1024(,%ebp,2), %ymm2 +# INTEL: vsqrtnepbf16 ymm2, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x7d,0x28,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vsqrtnepbf16 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7d,0xaf,0x51,0x51,0x7f + +# ATT: vsqrtnepbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7d,0xbf,0x51,0x52,0x80 + +# ATT: vsqrtnepbf16 268435456(%esp,%esi,8), %zmm2 +# INTEL: vsqrtnepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vsqrtnepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%eax){1to32}, %zmm2 +# INTEL: vsqrtnepbf16 zmm2, word ptr [eax]{1to32} +0x62,0xf5,0x7d,0x58,0x51,0x10 + +# ATT: vsqrtnepbf16 -2048(,%ebp,2), %zmm2 +# INTEL: vsqrtnepbf16 zmm2, zmmword ptr [2*ebp - 2048] 
+0x62,0xf5,0x7d,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vsqrtnepbf16 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf5,0x7d,0xcf,0x51,0x51,0x7f + +# ATT: vsqrtnepbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf5,0x7d,0xdf,0x51,0x52,0x80 + +# ATT: vsubnepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vsubnepbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x5c,0xd4 + +# ATT: vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vsubnepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x65,0x2f,0x5c,0xd4 + +# ATT: vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vsubnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x5c,0xd4 + +# ATT: vsubnepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vsubnepbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x5c,0xd4 + +# ATT: vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vsubnepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x5c,0xd4 + +# ATT: vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vsubnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x5c,0xd4 + +# ATT: vsubnepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vsubnepbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x5c,0xd4 + +# ATT: vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vsubnepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x5c,0xd4 + +# ATT: vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vsubnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x5c,0xd4 + +# ATT: vsubnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vsubnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vsubnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vsubnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x65,0x58,0x5c,0x10 + +# ATT: vsubnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vsubnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x5c,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vsubnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vsubnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x5c,0x51,0x7f + +# ATT: vsubnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vsubnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x5c,0x52,0x80 + +# ATT: vsubnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vsubnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vsubnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vsubnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x5c,0x10 + +# ATT: vsubnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vsubnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x5c,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vsubnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vsubnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x5c,0x51,0x7f + +# ATT: vsubnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vsubnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x5c,0x52,0x80 + +# ATT: vsubnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: 
vsubnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x08,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vsubnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vsubnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x5c,0x10 + +# ATT: vsubnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vsubnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x5c,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vsubnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vsubnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x5c,0x51,0x7f + +# ATT: vsubnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vsubnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x5c,0x52,0x80 + diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt new file mode 100644 index 0000000000000..953ef8dd8a14c --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt @@ -0,0 +1,3015 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vaddnepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vaddnepbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x58,0xf0 + +# ATT: vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vaddnepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x58,0xf0 + +# ATT: vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vaddnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x58,0xf0 + +# ATT: vaddnepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vaddnepbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x58,0xf0 + +# ATT: vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vaddnepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x58,0xf0 + +# ATT: vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vaddnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x58,0xf0 + +# ATT: vaddnepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vaddnepbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x58,0xf0 + +# ATT: vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vaddnepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x58,0xf0 + +# ATT: vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vaddnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x58,0xf0 + +# ATT: vaddnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vaddnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vaddnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x58,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vaddnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x58,0x35,0x00,0x00,0x00,0x00 + +# ATT: vaddnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vaddnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x58,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vaddnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vaddnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x58,0x71,0x7f + +# ATT: vaddnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vaddnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 
256]{1to32} +0x62,0xe5,0x45,0xd7,0x58,0x72,0x80 + +# ATT: vaddnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vaddnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vaddnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x58,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vaddnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x58,0x35,0x00,0x00,0x00,0x00 + +# ATT: vaddnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vaddnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x58,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vaddnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vaddnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x58,0x71,0x7f + +# ATT: vaddnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vaddnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x58,0x72,0x80 + +# ATT: vaddnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vaddnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vaddnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x07,0x58,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vaddnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x58,0x35,0x00,0x00,0x00,0x00 + +# ATT: vaddnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vaddnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x58,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vaddnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vaddnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x58,0x71,0x7f + +# ATT: vaddnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vaddnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x58,0x72,0x80 + +# ATT: vcmppbf16 $123, %ymm24, %ymm23, %k5 +# INTEL: vcmppbf16 k5, ymm23, ymm24, 123 +0x62,0x93,0x47,0x20,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, %ymm24, %ymm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm23, ymm24, 123 +0x62,0x93,0x47,0x27,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, %xmm24, %xmm23, %k5 +# INTEL: vcmppbf16 k5, xmm23, xmm24, 123 +0x62,0x93,0x47,0x00,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, %xmm24, %xmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm23, xmm24, 123 +0x62,0x93,0x47,0x07,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, %zmm24, %zmm23, %k5 +# INTEL: vcmppbf16 k5, zmm23, zmm24, 123 +0x62,0x93,0x47,0x40,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, %zmm24, %zmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm23, zmm24, 123 +0x62,0x93,0x47,0x47,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, 268435456(%rbp,%r14,8), %zmm23, %k5 +# INTEL: vcmppbf16 k5, zmm23, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xb3,0x47,0x40,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%r8,%rax,4), %zmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xd3,0x47,0x47,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%rip){1to32}, %zmm23, %k5 +# INTEL: vcmppbf16 k5, zmm23, word ptr [rip]{1to32}, 123 +0x62,0xf3,0x47,0x50,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, -2048(,%rbp,2), %zmm23, %k5 +# INTEL: vcmppbf16 
k5, zmm23, zmmword ptr [2*rbp - 2048], 123 +0x62,0xf3,0x47,0x40,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 8128(%rcx), %zmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm23, zmmword ptr [rcx + 8128], 123 +0x62,0xf3,0x47,0x47,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%rdx){1to32}, %zmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm23, word ptr [rdx - 256]{1to32}, 123 +0x62,0xf3,0x47,0x57,0xc2,0x6a,0x80,0x7b + +# ATT: vcmppbf16 $123, 268435456(%rbp,%r14,8), %xmm23, %k5 +# INTEL: vcmppbf16 k5, xmm23, xmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xb3,0x47,0x00,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%r8,%rax,4), %xmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xd3,0x47,0x07,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%rip){1to8}, %xmm23, %k5 +# INTEL: vcmppbf16 k5, xmm23, word ptr [rip]{1to8}, 123 +0x62,0xf3,0x47,0x10,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, -512(,%rbp,2), %xmm23, %k5 +# INTEL: vcmppbf16 k5, xmm23, xmmword ptr [2*rbp - 512], 123 +0x62,0xf3,0x47,0x00,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 2032(%rcx), %xmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm23, xmmword ptr [rcx + 2032], 123 +0x62,0xf3,0x47,0x07,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%rdx){1to8}, %xmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm23, word ptr [rdx - 256]{1to8}, 123 +0x62,0xf3,0x47,0x17,0xc2,0x6a,0x80,0x7b + +# ATT: vcmppbf16 $123, 268435456(%rbp,%r14,8), %ymm23, %k5 +# INTEL: vcmppbf16 k5, ymm23, ymmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xb3,0x47,0x20,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%r8,%rax,4), %ymm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291], 123 +0x62,0xd3,0x47,0x27,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%rip){1to16}, %ymm23, %k5 +# INTEL: vcmppbf16 k5, ymm23, word ptr [rip]{1to16}, 123 +0x62,0xf3,0x47,0x30,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, -1024(,%rbp,2), %ymm23, %k5 +# INTEL: vcmppbf16 k5, ymm23, ymmword ptr [2*rbp - 1024], 123 +0x62,0xf3,0x47,0x20,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 4064(%rcx), %ymm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm23, ymmword ptr [rcx + 4064], 123 +0x62,0xf3,0x47,0x27,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%rdx){1to16}, %ymm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm23, word ptr [rdx - 256]{1to16}, 123 +0x62,0xf3,0x47,0x37,0xc2,0x6a,0x80,0x7b + +# ATT: vcomsbf16 %xmm23, %xmm22 +# INTEL: vcomsbf16 xmm22, xmm23 +0x62,0xa5,0x7d,0x08,0x2f,0xf7 + +# ATT: vcomsbf16 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcomsbf16 291(%r8,%rax,4), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcomsbf16 (%rip), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [rip] +0x62,0xe5,0x7d,0x08,0x2f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcomsbf16 -64(,%rbp,2), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [2*rbp - 64] +0x62,0xe5,0x7d,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff + +# ATT: vcomsbf16 254(%rcx), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [rcx + 254] +0x62,0xe5,0x7d,0x08,0x2f,0x71,0x7f + +# ATT: vcomsbf16 -256(%rdx), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [rdx - 256] +0x62,0xe5,0x7d,0x08,0x2f,0x72,0x80 + +# ATT: vdivnepbf16 %ymm24, 
%ymm23, %ymm22 +# INTEL: vdivnepbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x5e,0xf0 + +# ATT: vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vdivnepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x5e,0xf0 + +# ATT: vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vdivnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x5e,0xf0 + +# ATT: vdivnepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vdivnepbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x5e,0xf0 + +# ATT: vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vdivnepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x5e,0xf0 + +# ATT: vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vdivnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x5e,0xf0 + +# ATT: vdivnepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vdivnepbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x5e,0xf0 + +# ATT: vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vdivnepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x5e,0xf0 + +# ATT: vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vdivnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x5e,0xf0 + +# ATT: vdivnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vdivnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vdivnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vdivnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x5e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vdivnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vdivnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x5e,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vdivnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vdivnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x5e,0x71,0x7f + +# ATT: vdivnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vdivnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x45,0xd7,0x5e,0x72,0x80 + +# ATT: vdivnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vdivnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vdivnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vdivnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x5e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vdivnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vdivnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x5e,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vdivnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vdivnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x5e,0x71,0x7f + +# ATT: vdivnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vdivnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x5e,0x72,0x80 + +# ATT: vdivnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vdivnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vdivnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] 
+0x62,0xc5,0x45,0x07,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vdivnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x5e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vdivnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vdivnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x5e,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vdivnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vdivnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x5e,0x71,0x7f + +# ATT: vdivnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vdivnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x5e,0x72,0x80 + +# ATT: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmadd132nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmadd132nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmadd132nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0x98,0xf0 + +# ATT: vfmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0x98,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0x98,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0x98,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0x98,0x71,0x7f + +# ATT: vfmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0x98,0x72,0x80 + +# ATT: vfmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmadd132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0x98,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: 
vfmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0x98,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0x98,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0x98,0x71,0x7f + +# ATT: vfmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0x98,0x72,0x80 + +# ATT: vfmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0x98,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0x98,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0x98,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0x98,0x71,0x7f + +# ATT: vfmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0x98,0x72,0x80 + +# ATT: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmadd213nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmadd213nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmadd213nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} 
+0x62,0xe6,0x44,0x50,0xa8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xa8,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xa8,0x71,0x7f + +# ATT: vfmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xa8,0x72,0x80 + +# ATT: vfmadd213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xa8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xa8,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xa8,0x71,0x7f + +# ATT: vfmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xa8,0x72,0x80 + +# ATT: vfmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xa8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xa8,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xa8,0x71,0x7f + +# ATT: vfmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xa8,0x72,0x80 + +# ATT: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmadd231nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmadd231nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# 
INTEL: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmadd231nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xb8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xb8,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xb8,0x71,0x7f + +# ATT: vfmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xb8,0x72,0x80 + +# ATT: vfmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xb8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xb8,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xb8,0x71,0x7f + +# ATT: vfmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xb8,0x72,0x80 + +# ATT: vfmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xb8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xb8,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: 
vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xb8,0x71,0x7f + +# ATT: vfmadd231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xb8,0x72,0x80 + +# ATT: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmsub132nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmsub132nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmsub132nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0x9a,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0x9a,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0x9a,0x71,0x7f + +# ATT: vfmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0x9a,0x72,0x80 + +# ATT: vfmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0x9a,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0x9a,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] 
+0x62,0xe6,0x44,0xa7,0x9a,0x71,0x7f + +# ATT: vfmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0x9a,0x72,0x80 + +# ATT: vfmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0x9a,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0x9a,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0x9a,0x71,0x7f + +# ATT: vfmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0x9a,0x72,0x80 + +# ATT: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmsub213nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmsub213nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmsub213nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xaa,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xaa,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xaa,0x71,0x7f + +# ATT: vfmsub213nepbf16 -256(%rdx){1to32}, %zmm23, 
%zmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xaa,0x72,0x80 + +# ATT: vfmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xaa,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xaa,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xaa,0x71,0x7f + +# ATT: vfmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xaa,0x72,0x80 + +# ATT: vfmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmsub213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xaa,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xaa,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xaa,0x71,0x7f + +# ATT: vfmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xaa,0x72,0x80 + +# ATT: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmsub231nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmsub231nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmsub231nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, 
xmm24 +0x62,0x86,0x44,0x87,0xba,0xf0 + +# ATT: vfmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xba,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xba,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xba,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xba,0x71,0x7f + +# ATT: vfmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xba,0x72,0x80 + +# ATT: vfmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xba,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xba,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xba,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xba,0x71,0x7f + +# ATT: vfmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xba,0x72,0x80 + +# ATT: vfmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xba,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xba,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xba,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xba,0x71,0x7f + +# ATT: vfmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xba,0x72,0x80 + +# ATT: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmadd132nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} 
+# INTEL: vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmadd132nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmadd132nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0x9c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0x9c,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0x9c,0x71,0x7f + +# ATT: vfnmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0x9c,0x72,0x80 + +# ATT: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0x9c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0x9c,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0x9c,0x71,0x7f + +# ATT: vfnmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0x9c,0x72,0x80 + +# ATT: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 
291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0x9c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0x9c,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0x9c,0x71,0x7f + +# ATT: vfnmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0x9c,0x72,0x80 + +# ATT: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmadd213nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmadd213nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmadd213nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xac,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xac,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xac,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xac,0x71,0x7f + +# ATT: vfnmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xac,0x72,0x80 + +# ATT: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: 
vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xac,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xac,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xac,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xac,0x71,0x7f + +# ATT: vfnmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xac,0x72,0x80 + +# ATT: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xac,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xac,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xac,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xac,0x71,0x7f + +# ATT: vfnmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xac,0x72,0x80 + +# ATT: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmadd231nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmadd231nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmadd231nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 
+ 4*rax + 291] +0x62,0xc6,0x44,0x47,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xbc,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xbc,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xbc,0x71,0x7f + +# ATT: vfnmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xbc,0x72,0x80 + +# ATT: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xbc,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xbc,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xbc,0x71,0x7f + +# ATT: vfnmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xbc,0x72,0x80 + +# ATT: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xbc,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xbc,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xbc,0x71,0x7f + +# ATT: vfnmadd231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xbc,0x72,0x80 + +# ATT: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmsub132nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmsub132nepbf16 zmm22, zmm23, zmm24 
+0x62,0x86,0x44,0x40,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmsub132nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0x9e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0x9e,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0x9e,0x71,0x7f + +# ATT: vfnmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0x9e,0x72,0x80 + +# ATT: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0x9e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0x9e,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0x9e,0x71,0x7f + +# ATT: vfnmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0x9e,0x72,0x80 + +# ATT: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} 
+0x62,0xe6,0x44,0x10,0x9e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0x9e,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0x9e,0x71,0x7f + +# ATT: vfnmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0x9e,0x72,0x80 + +# ATT: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmsub213nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmsub213nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmsub213nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xae,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xae,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xae,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xae,0x71,0x7f + +# ATT: vfnmsub213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xae,0x72,0x80 + +# ATT: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xae,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xae,0x35,0x00,0x00,0x00,0x00 
+ +# ATT: vfnmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xae,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xae,0x71,0x7f + +# ATT: vfnmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xae,0x72,0x80 + +# ATT: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xae,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xae,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xae,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xae,0x71,0x7f + +# ATT: vfnmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xae,0x72,0x80 + +# ATT: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmsub231nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmsub231nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmsub231nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xbe,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub231nepbf16 -2048(,%rbp,2), %zmm23, 
%zmm22 +# INTEL: vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xbe,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xbe,0x71,0x7f + +# ATT: vfnmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xbe,0x72,0x80 + +# ATT: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xbe,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xbe,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xbe,0x71,0x7f + +# ATT: vfnmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xbe,0x72,0x80 + +# ATT: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xbe,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xbe,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xbe,0x71,0x7f + +# ATT: vfnmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xbe,0x72,0x80 + +# ATT: vfpclasspbf16 $123, %zmm23, %k5 +# INTEL: vfpclasspbf16 k5, zmm23, 123 +0x62,0xb3,0x7f,0x48,0x66,0xef,0x7b + +# ATT: vfpclasspbf16 $123, %zmm23, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, zmm23, 123 +0x62,0xb3,0x7f,0x4f,0x66,0xef,0x7b + +# ATT: vfpclasspbf16 $123, %ymm23, %k5 +# INTEL: vfpclasspbf16 k5, ymm23, 123 +0x62,0xb3,0x7f,0x28,0x66,0xef,0x7b + +# ATT: vfpclasspbf16 $123, %ymm23, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, ymm23, 123 +0x62,0xb3,0x7f,0x2f,0x66,0xef,0x7b + +# ATT: vfpclasspbf16 $123, %xmm23, %k5 +# INTEL: vfpclasspbf16 k5, xmm23, 123 +0x62,0xb3,0x7f,0x08,0x66,0xef,0x7b + +# ATT: vfpclasspbf16 $123, %xmm23, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmm23, 123 +0x62,0xb3,0x7f,0x0f,0x66,0xef,0x7b + +# ATT: vfpclasspbf16x $123, 268435456(%rbp,%r14,8), %k5 +# INTEL: 
vfpclasspbf16 k5, xmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xb3,0x7f,0x08,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vfpclasspbf16x $123, 291(%r8,%rax,4), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xd3,0x7f,0x0f,0x66,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vfpclasspbf16 $123, (%rip){1to8}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [rip]{1to8}, 123 +0x62,0xf3,0x7f,0x18,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vfpclasspbf16x $123, -512(,%rbp,2), %k5 +# INTEL: vfpclasspbf16 k5, xmmword ptr [2*rbp - 512], 123 +0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vfpclasspbf16x $123, 2032(%rcx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmmword ptr [rcx + 2032], 123 +0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%rdx){1to8}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to8}, 123 +0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b + +# ATT: vfpclasspbf16 $123, (%rip){1to16}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [rip]{1to16}, 123 +0x62,0xf3,0x7f,0x38,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vfpclasspbf16y $123, -1024(,%rbp,2), %k5 +# INTEL: vfpclasspbf16 k5, ymmword ptr [2*rbp - 1024], 123 +0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vfpclasspbf16y $123, 4064(%rcx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, ymmword ptr [rcx + 4064], 123 +0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%rdx){1to16}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to16}, 123 +0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b + +# ATT: vfpclasspbf16 $123, (%rip){1to32}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [rip]{1to32}, 123 +0x62,0xf3,0x7f,0x58,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vfpclasspbf16z $123, -2048(,%rbp,2), %k5 +# INTEL: vfpclasspbf16 k5, zmmword ptr [2*rbp - 2048], 123 +0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vfpclasspbf16z $123, 8128(%rcx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, zmmword ptr [rcx + 8128], 123 +0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%rdx){1to32}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to32}, 123 +0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b + +# ATT: vgetexppbf16 %xmm23, %xmm22 +# INTEL: vgetexppbf16 xmm22, xmm23 +0x62,0xa5,0x7d,0x08,0x42,0xf7 + +# ATT: vgetexppbf16 %xmm23, %xmm22 {%k7} +# INTEL: vgetexppbf16 xmm22 {k7}, xmm23 +0x62,0xa5,0x7d,0x0f,0x42,0xf7 + +# ATT: vgetexppbf16 %xmm23, %xmm22 {%k7} {z} +# INTEL: vgetexppbf16 xmm22 {k7} {z}, xmm23 +0x62,0xa5,0x7d,0x8f,0x42,0xf7 + +# ATT: vgetexppbf16 %zmm23, %zmm22 +# INTEL: vgetexppbf16 zmm22, zmm23 +0x62,0xa5,0x7d,0x48,0x42,0xf7 + +# ATT: vgetexppbf16 %zmm23, %zmm22 {%k7} +# INTEL: vgetexppbf16 zmm22 {k7}, zmm23 +0x62,0xa5,0x7d,0x4f,0x42,0xf7 + +# ATT: vgetexppbf16 %zmm23, %zmm22 {%k7} {z} +# INTEL: vgetexppbf16 zmm22 {k7} {z}, zmm23 +0x62,0xa5,0x7d,0xcf,0x42,0xf7 + +# ATT: vgetexppbf16 %ymm23, %ymm22 +# INTEL: vgetexppbf16 ymm22, ymm23 +0x62,0xa5,0x7d,0x28,0x42,0xf7 + +# ATT: vgetexppbf16 %ymm23, %ymm22 {%k7} +# INTEL: vgetexppbf16 ymm22 {k7}, ymm23 +0x62,0xa5,0x7d,0x2f,0x42,0xf7 + +# ATT: vgetexppbf16 %ymm23, %ymm22 {%k7} {z} +# INTEL: vgetexppbf16 ymm22 {k7} {z}, ymm23 +0x62,0xa5,0x7d,0xaf,0x42,0xf7 + +# ATT: vgetexppbf16 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vgetexppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: 
vgetexppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%rip){1to8}, %xmm22 +# INTEL: vgetexppbf16 xmm22, word ptr [rip]{1to8} +0x62,0xe5,0x7d,0x18,0x42,0x35,0x00,0x00,0x00,0x00 + +# ATT: vgetexppbf16 -512(,%rbp,2), %xmm22 +# INTEL: vgetexppbf16 xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x7d,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vgetexppbf16 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vgetexppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe5,0x7d,0x8f,0x42,0x71,0x7f + +# ATT: vgetexppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vgetexppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x7d,0x9f,0x42,0x72,0x80 + +# ATT: vgetexppbf16 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vgetexppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vgetexppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%rip){1to16}, %ymm22 +# INTEL: vgetexppbf16 ymm22, word ptr [rip]{1to16} +0x62,0xe5,0x7d,0x38,0x42,0x35,0x00,0x00,0x00,0x00 + +# ATT: vgetexppbf16 -1024(,%rbp,2), %ymm22 +# INTEL: vgetexppbf16 ymm22, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x7d,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vgetexppbf16 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vgetexppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe5,0x7d,0xaf,0x42,0x71,0x7f + +# ATT: vgetexppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vgetexppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x7d,0xbf,0x42,0x72,0x80 + +# ATT: vgetexppbf16 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vgetexppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vgetexppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%rip){1to32}, %zmm22 +# INTEL: vgetexppbf16 zmm22, word ptr [rip]{1to32} +0x62,0xe5,0x7d,0x58,0x42,0x35,0x00,0x00,0x00,0x00 + +# ATT: vgetexppbf16 -2048(,%rbp,2), %zmm22 +# INTEL: vgetexppbf16 zmm22, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x7d,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vgetexppbf16 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vgetexppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe5,0x7d,0xcf,0x42,0x71,0x7f + +# ATT: vgetexppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vgetexppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x7d,0xdf,0x42,0x72,0x80 + +# ATT: vgetmantpbf16 $123, %zmm23, %zmm22 +# INTEL: vgetmantpbf16 zmm22, zmm23, 123 +0x62,0xa3,0x7f,0x48,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} +# INTEL: vgetmantpbf16 zmm22 {k7}, zmm23, 123 +0x62,0xa3,0x7f,0x4f,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} {z} +# INTEL: vgetmantpbf16 zmm22 {k7} {z}, zmm23, 123 +0x62,0xa3,0x7f,0xcf,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %ymm23, %ymm22 +# INTEL: vgetmantpbf16 ymm22, ymm23, 123 +0x62,0xa3,0x7f,0x28,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} +# INTEL: vgetmantpbf16 ymm22 {k7}, ymm23, 123 +0x62,0xa3,0x7f,0x2f,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} {z} +# INTEL: vgetmantpbf16 ymm22 {k7} {z}, ymm23, 123 +0x62,0xa3,0x7f,0xaf,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %xmm23, %xmm22 +# INTEL: vgetmantpbf16 xmm22, 
xmm23, 123 +0x62,0xa3,0x7f,0x08,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} +# INTEL: vgetmantpbf16 xmm22 {k7}, xmm23, 123 +0x62,0xa3,0x7f,0x0f,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} {z} +# INTEL: vgetmantpbf16 xmm22 {k7} {z}, xmm23, 123 +0x62,0xa3,0x7f,0x8f,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vgetmantpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x08,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vgetmantpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x0f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%rip){1to8}, %xmm22 +# INTEL: vgetmantpbf16 xmm22, word ptr [rip]{1to8}, 123 +0x62,0xe3,0x7f,0x18,0x26,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, -512(,%rbp,2), %xmm22 +# INTEL: vgetmantpbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +0x62,0xe3,0x7f,0x08,0x26,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vgetmantpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +0x62,0xe3,0x7f,0x8f,0x26,0x71,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vgetmantpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +0x62,0xe3,0x7f,0x9f,0x26,0x72,0x80,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vgetmantpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x28,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vgetmantpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x2f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%rip){1to16}, %ymm22 +# INTEL: vgetmantpbf16 ymm22, word ptr [rip]{1to16}, 123 +0x62,0xe3,0x7f,0x38,0x26,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, -1024(,%rbp,2), %ymm22 +# INTEL: vgetmantpbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +0x62,0xe3,0x7f,0x28,0x26,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vgetmantpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 +0x62,0xe3,0x7f,0xaf,0x26,0x71,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vgetmantpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +0x62,0xe3,0x7f,0xbf,0x26,0x72,0x80,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vgetmantpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x48,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vgetmantpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x4f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%rip){1to32}, %zmm22 +# INTEL: vgetmantpbf16 zmm22, word ptr [rip]{1to32}, 123 +0x62,0xe3,0x7f,0x58,0x26,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, -2048(,%rbp,2), %zmm22 +# INTEL: vgetmantpbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +0x62,0xe3,0x7f,0x48,0x26,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vgetmantpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +0x62,0xe3,0x7f,0xcf,0x26,0x71,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vgetmantpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 
+0x62,0xe3,0x7f,0xdf,0x26,0x72,0x80,0x7b + +# ATT: vmaxpbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vmaxpbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x5f,0xf0 + +# ATT: vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vmaxpbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x5f,0xf0 + +# ATT: vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vmaxpbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x5f,0xf0 + +# ATT: vmaxpbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vmaxpbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x5f,0xf0 + +# ATT: vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vmaxpbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x5f,0xf0 + +# ATT: vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vmaxpbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x5f,0xf0 + +# ATT: vmaxpbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vmaxpbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x5f,0xf0 + +# ATT: vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vmaxpbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x5f,0xf0 + +# ATT: vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vmaxpbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x5f,0xf0 + +# ATT: vmaxpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vmaxpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vmaxpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vmaxpbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x5f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmaxpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vmaxpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x5f,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vmaxpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vmaxpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x5f,0x71,0x7f + +# ATT: vmaxpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vmaxpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x45,0xd7,0x5f,0x72,0x80 + +# ATT: vmaxpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vmaxpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vmaxpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vmaxpbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x5f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmaxpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vmaxpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x5f,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vmaxpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vmaxpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x5f,0x71,0x7f + +# ATT: vmaxpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vmaxpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x5f,0x72,0x80 + +# ATT: vmaxpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vmaxpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vmaxpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] 
+0x62,0xc5,0x45,0x07,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vmaxpbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x5f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmaxpbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vmaxpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x5f,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vmaxpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vmaxpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x5f,0x71,0x7f + +# ATT: vmaxpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vmaxpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x5f,0x72,0x80 + +# ATT: vminpbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vminpbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x5d,0xf0 + +# ATT: vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vminpbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x5d,0xf0 + +# ATT: vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vminpbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x5d,0xf0 + +# ATT: vminpbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vminpbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x5d,0xf0 + +# ATT: vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vminpbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x5d,0xf0 + +# ATT: vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vminpbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x5d,0xf0 + +# ATT: vminpbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vminpbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x5d,0xf0 + +# ATT: vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vminpbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x5d,0xf0 + +# ATT: vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vminpbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x5d,0xf0 + +# ATT: vminpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vminpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vminpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vminpbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x5d,0x35,0x00,0x00,0x00,0x00 + +# ATT: vminpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vminpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x5d,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vminpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vminpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x5d,0x71,0x7f + +# ATT: vminpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vminpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x45,0xd7,0x5d,0x72,0x80 + +# ATT: vminpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vminpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vminpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vminpbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x5d,0x35,0x00,0x00,0x00,0x00 + +# ATT: vminpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vminpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x5d,0x34,0x6d,0x00,0xfc,0xff,0xff + +# 
ATT: vminpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vminpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x5d,0x71,0x7f + +# ATT: vminpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vminpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x5d,0x72,0x80 + +# ATT: vminpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vminpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vminpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x07,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vminpbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x5d,0x35,0x00,0x00,0x00,0x00 + +# ATT: vminpbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vminpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x5d,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vminpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vminpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x5d,0x71,0x7f + +# ATT: vminpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vminpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x5d,0x72,0x80 + +# ATT: vmulnepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vmulnepbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x59,0xf0 + +# ATT: vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vmulnepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x59,0xf0 + +# ATT: vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vmulnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x59,0xf0 + +# ATT: vmulnepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vmulnepbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x59,0xf0 + +# ATT: vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vmulnepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x59,0xf0 + +# ATT: vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vmulnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x59,0xf0 + +# ATT: vmulnepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vmulnepbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x59,0xf0 + +# ATT: vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vmulnepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x59,0xf0 + +# ATT: vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vmulnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x59,0xf0 + +# ATT: vmulnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vmulnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vmulnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x59,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vmulnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x59,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmulnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vmulnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x59,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vmulnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vmulnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x59,0x71,0x7f + +# ATT: vmulnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vmulnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x45,0xd7,0x59,0x72,0x80 + 
+# ATT: vmulnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vmulnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vmulnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x59,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vmulnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x59,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmulnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vmulnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x59,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vmulnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vmulnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x59,0x71,0x7f + +# ATT: vmulnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vmulnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x59,0x72,0x80 + +# ATT: vmulnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vmulnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vmulnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x07,0x59,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vmulnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x59,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmulnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vmulnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x59,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vmulnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vmulnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x59,0x71,0x7f + +# ATT: vmulnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vmulnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x59,0x72,0x80 + +# ATT: vrcppbf16 %xmm23, %xmm22 +# INTEL: vrcppbf16 xmm22, xmm23 +0x62,0xa6,0x7c,0x08,0x4c,0xf7 + +# ATT: vrcppbf16 %xmm23, %xmm22 {%k7} +# INTEL: vrcppbf16 xmm22 {k7}, xmm23 +0x62,0xa6,0x7c,0x0f,0x4c,0xf7 + +# ATT: vrcppbf16 %xmm23, %xmm22 {%k7} {z} +# INTEL: vrcppbf16 xmm22 {k7} {z}, xmm23 +0x62,0xa6,0x7c,0x8f,0x4c,0xf7 + +# ATT: vrcppbf16 %zmm23, %zmm22 +# INTEL: vrcppbf16 zmm22, zmm23 +0x62,0xa6,0x7c,0x48,0x4c,0xf7 + +# ATT: vrcppbf16 %zmm23, %zmm22 {%k7} +# INTEL: vrcppbf16 zmm22 {k7}, zmm23 +0x62,0xa6,0x7c,0x4f,0x4c,0xf7 + +# ATT: vrcppbf16 %zmm23, %zmm22 {%k7} {z} +# INTEL: vrcppbf16 zmm22 {k7} {z}, zmm23 +0x62,0xa6,0x7c,0xcf,0x4c,0xf7 + +# ATT: vrcppbf16 %ymm23, %ymm22 +# INTEL: vrcppbf16 ymm22, ymm23 +0x62,0xa6,0x7c,0x28,0x4c,0xf7 + +# ATT: vrcppbf16 %ymm23, %ymm22 {%k7} +# INTEL: vrcppbf16 ymm22 {k7}, ymm23 +0x62,0xa6,0x7c,0x2f,0x4c,0xf7 + +# ATT: vrcppbf16 %ymm23, %ymm22 {%k7} {z} +# INTEL: vrcppbf16 ymm22 {k7} {z}, ymm23 +0x62,0xa6,0x7c,0xaf,0x4c,0xf7 + +# ATT: vrcppbf16 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vrcppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x08,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vrcppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x0f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%rip){1to8}, %xmm22 +# INTEL: vrcppbf16 xmm22, word ptr [rip]{1to8} 
+0x62,0xe6,0x7c,0x18,0x4c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrcppbf16 -512(,%rbp,2), %xmm22 +# INTEL: vrcppbf16 xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x7c,0x08,0x4c,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vrcppbf16 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vrcppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe6,0x7c,0x8f,0x4c,0x71,0x7f + +# ATT: vrcppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vrcppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x7c,0x9f,0x4c,0x72,0x80 + +# ATT: vrcppbf16 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vrcppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x28,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vrcppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x2f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%rip){1to16}, %ymm22 +# INTEL: vrcppbf16 ymm22, word ptr [rip]{1to16} +0x62,0xe6,0x7c,0x38,0x4c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrcppbf16 -1024(,%rbp,2), %ymm22 +# INTEL: vrcppbf16 ymm22, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x7c,0x28,0x4c,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vrcppbf16 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vrcppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe6,0x7c,0xaf,0x4c,0x71,0x7f + +# ATT: vrcppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vrcppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x7c,0xbf,0x4c,0x72,0x80 + +# ATT: vrcppbf16 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vrcppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x48,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vrcppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x4f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%rip){1to32}, %zmm22 +# INTEL: vrcppbf16 zmm22, word ptr [rip]{1to32} +0x62,0xe6,0x7c,0x58,0x4c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrcppbf16 -2048(,%rbp,2), %zmm22 +# INTEL: vrcppbf16 zmm22, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x7c,0x48,0x4c,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vrcppbf16 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vrcppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe6,0x7c,0xcf,0x4c,0x71,0x7f + +# ATT: vrcppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vrcppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x7c,0xdf,0x4c,0x72,0x80 + +# ATT: vreducenepbf16 $123, %zmm23, %zmm22 +# INTEL: vreducenepbf16 zmm22, zmm23, 123 +0x62,0xa3,0x7f,0x48,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %zmm23, %zmm22 {%k7} +# INTEL: vreducenepbf16 zmm22 {k7}, zmm23, 123 +0x62,0xa3,0x7f,0x4f,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %zmm23, %zmm22 {%k7} {z} +# INTEL: vreducenepbf16 zmm22 {k7} {z}, zmm23, 123 +0x62,0xa3,0x7f,0xcf,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %ymm23, %ymm22 +# INTEL: vreducenepbf16 ymm22, ymm23, 123 +0x62,0xa3,0x7f,0x28,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %ymm23, %ymm22 {%k7} +# INTEL: vreducenepbf16 ymm22 {k7}, ymm23, 123 +0x62,0xa3,0x7f,0x2f,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %ymm23, %ymm22 {%k7} {z} +# INTEL: vreducenepbf16 ymm22 {k7} {z}, ymm23, 123 +0x62,0xa3,0x7f,0xaf,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %xmm23, %xmm22 +# INTEL: vreducenepbf16 xmm22, xmm23, 123 +0x62,0xa3,0x7f,0x08,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %xmm23, %xmm22 {%k7} +# INTEL: vreducenepbf16 xmm22 {k7}, xmm23, 123 +0x62,0xa3,0x7f,0x0f,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %xmm23, %xmm22 {%k7} {z} +# INTEL: vreducenepbf16 xmm22 {k7} {z}, xmm23, 
123 +0x62,0xa3,0x7f,0x8f,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vreducenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x08,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vreducenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x0f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%rip){1to8}, %xmm22 +# INTEL: vreducenepbf16 xmm22, word ptr [rip]{1to8}, 123 +0x62,0xe3,0x7f,0x18,0x56,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, -512(,%rbp,2), %xmm22 +# INTEL: vreducenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +0x62,0xe3,0x7f,0x08,0x56,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vreducenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +0x62,0xe3,0x7f,0x8f,0x56,0x71,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vreducenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +0x62,0xe3,0x7f,0x9f,0x56,0x72,0x80,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vreducenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x28,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vreducenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x2f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%rip){1to16}, %ymm22 +# INTEL: vreducenepbf16 ymm22, word ptr [rip]{1to16}, 123 +0x62,0xe3,0x7f,0x38,0x56,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, -1024(,%rbp,2), %ymm22 +# INTEL: vreducenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +0x62,0xe3,0x7f,0x28,0x56,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vreducenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 +0x62,0xe3,0x7f,0xaf,0x56,0x71,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vreducenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +0x62,0xe3,0x7f,0xbf,0x56,0x72,0x80,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vreducenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x48,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vreducenepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x4f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%rip){1to32}, %zmm22 +# INTEL: vreducenepbf16 zmm22, word ptr [rip]{1to32}, 123 +0x62,0xe3,0x7f,0x58,0x56,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, -2048(,%rbp,2), %zmm22 +# INTEL: vreducenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +0x62,0xe3,0x7f,0x48,0x56,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vreducenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +0x62,0xe3,0x7f,0xcf,0x56,0x71,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vreducenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +0x62,0xe3,0x7f,0xdf,0x56,0x72,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm23, %zmm22 +# INTEL: vrndscalenepbf16 zmm22, zmm23, 123 +0x62,0xa3,0x7f,0x48,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} +# INTEL: vrndscalenepbf16 zmm22 {k7}, zmm23, 123 
+0x62,0xa3,0x7f,0x4f,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm22 {k7} {z}, zmm23, 123 +0x62,0xa3,0x7f,0xcf,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm23, %ymm22 +# INTEL: vrndscalenepbf16 ymm22, ymm23, 123 +0x62,0xa3,0x7f,0x28,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} +# INTEL: vrndscalenepbf16 ymm22 {k7}, ymm23, 123 +0x62,0xa3,0x7f,0x2f,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm22 {k7} {z}, ymm23, 123 +0x62,0xa3,0x7f,0xaf,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm23, %xmm22 +# INTEL: vrndscalenepbf16 xmm22, xmm23, 123 +0x62,0xa3,0x7f,0x08,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} +# INTEL: vrndscalenepbf16 xmm22 {k7}, xmm23, 123 +0x62,0xa3,0x7f,0x0f,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm22 {k7} {z}, xmm23, 123 +0x62,0xa3,0x7f,0x8f,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vrndscalenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x08,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vrndscalenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x0f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%rip){1to8}, %xmm22 +# INTEL: vrndscalenepbf16 xmm22, word ptr [rip]{1to8}, 123 +0x62,0xe3,0x7f,0x18,0x08,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, -512(,%rbp,2), %xmm22 +# INTEL: vrndscalenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +0x62,0xe3,0x7f,0x08,0x08,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +0x62,0xe3,0x7f,0x8f,0x08,0x71,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +0x62,0xe3,0x7f,0x9f,0x08,0x72,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vrndscalenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x28,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vrndscalenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x2f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%rip){1to16}, %ymm22 +# INTEL: vrndscalenepbf16 ymm22, word ptr [rip]{1to16}, 123 +0x62,0xe3,0x7f,0x38,0x08,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, -1024(,%rbp,2), %ymm22 +# INTEL: vrndscalenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +0x62,0xe3,0x7f,0x28,0x08,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 +0x62,0xe3,0x7f,0xaf,0x08,0x71,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +0x62,0xe3,0x7f,0xbf,0x08,0x72,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vrndscalenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x48,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vrndscalenepbf16 zmm22 {k7}, 
zmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x4f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%rip){1to32}, %zmm22 +# INTEL: vrndscalenepbf16 zmm22, word ptr [rip]{1to32}, 123 +0x62,0xe3,0x7f,0x58,0x08,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, -2048(,%rbp,2), %zmm22 +# INTEL: vrndscalenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +0x62,0xe3,0x7f,0x48,0x08,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +0x62,0xe3,0x7f,0xcf,0x08,0x71,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +0x62,0xe3,0x7f,0xdf,0x08,0x72,0x80,0x7b + +# ATT: vrsqrtpbf16 %xmm23, %xmm22 +# INTEL: vrsqrtpbf16 xmm22, xmm23 +0x62,0xa6,0x7c,0x08,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %xmm23, %xmm22 {%k7} +# INTEL: vrsqrtpbf16 xmm22 {k7}, xmm23 +0x62,0xa6,0x7c,0x0f,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %xmm23, %xmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm22 {k7} {z}, xmm23 +0x62,0xa6,0x7c,0x8f,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %zmm23, %zmm22 +# INTEL: vrsqrtpbf16 zmm22, zmm23 +0x62,0xa6,0x7c,0x48,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %zmm23, %zmm22 {%k7} +# INTEL: vrsqrtpbf16 zmm22 {k7}, zmm23 +0x62,0xa6,0x7c,0x4f,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %zmm23, %zmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm22 {k7} {z}, zmm23 +0x62,0xa6,0x7c,0xcf,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %ymm23, %ymm22 +# INTEL: vrsqrtpbf16 ymm22, ymm23 +0x62,0xa6,0x7c,0x28,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %ymm23, %ymm22 {%k7} +# INTEL: vrsqrtpbf16 ymm22 {k7}, ymm23 +0x62,0xa6,0x7c,0x2f,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %ymm23, %ymm22 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm22 {k7} {z}, ymm23 +0x62,0xa6,0x7c,0xaf,0x4e,0xf7 + +# ATT: vrsqrtpbf16 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vrsqrtpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x08,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vrsqrtpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x0f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%rip){1to8}, %xmm22 +# INTEL: vrsqrtpbf16 xmm22, word ptr [rip]{1to8} +0x62,0xe6,0x7c,0x18,0x4e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrsqrtpbf16 -512(,%rbp,2), %xmm22 +# INTEL: vrsqrtpbf16 xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x7c,0x08,0x4e,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vrsqrtpbf16 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe6,0x7c,0x8f,0x4e,0x71,0x7f + +# ATT: vrsqrtpbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x7c,0x9f,0x4e,0x72,0x80 + +# ATT: vrsqrtpbf16 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vrsqrtpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x28,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vrsqrtpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x2f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%rip){1to16}, %ymm22 +# INTEL: vrsqrtpbf16 ymm22, word ptr [rip]{1to16} +0x62,0xe6,0x7c,0x38,0x4e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrsqrtpbf16 -1024(,%rbp,2), %ymm22 +# INTEL: vrsqrtpbf16 ymm22, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x7c,0x28,0x4e,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vrsqrtpbf16 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] 
+0x62,0xe6,0x7c,0xaf,0x4e,0x71,0x7f + +# ATT: vrsqrtpbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x7c,0xbf,0x4e,0x72,0x80 + +# ATT: vrsqrtpbf16 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vrsqrtpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x48,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vrsqrtpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x4f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%rip){1to32}, %zmm22 +# INTEL: vrsqrtpbf16 zmm22, word ptr [rip]{1to32} +0x62,0xe6,0x7c,0x58,0x4e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrsqrtpbf16 -2048(,%rbp,2), %zmm22 +# INTEL: vrsqrtpbf16 zmm22, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x7c,0x48,0x4e,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vrsqrtpbf16 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe6,0x7c,0xcf,0x4e,0x71,0x7f + +# ATT: vrsqrtpbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x7c,0xdf,0x4e,0x72,0x80 + +# ATT: vscalefpbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vscalefpbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0x2c,0xf0 + +# ATT: vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vscalefpbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0x2c,0xf0 + +# ATT: vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vscalefpbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0x2c,0xf0 + +# ATT: vscalefpbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vscalefpbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0x2c,0xf0 + +# ATT: vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vscalefpbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0x2c,0xf0 + +# ATT: vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vscalefpbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0x2c,0xf0 + +# ATT: vscalefpbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vscalefpbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0x2c,0xf0 + +# ATT: vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vscalefpbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0x2c,0xf0 + +# ATT: vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vscalefpbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0x2c,0xf0 + +# ATT: vscalefpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vscalefpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vscalefpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vscalefpbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0x2c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vscalefpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vscalefpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0x2c,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vscalefpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vscalefpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0x2c,0x71,0x7f + +# ATT: vscalefpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vscalefpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0x2c,0x72,0x80 + +# ATT: vscalefpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vscalefpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] 
+0x62,0xa6,0x44,0x20,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vscalefpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vscalefpbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0x2c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vscalefpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vscalefpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0x2c,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vscalefpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vscalefpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0x2c,0x71,0x7f + +# ATT: vscalefpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vscalefpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0x2c,0x72,0x80 + +# ATT: vscalefpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vscalefpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vscalefpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vscalefpbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0x2c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vscalefpbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vscalefpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0x2c,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vscalefpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vscalefpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0x2c,0x71,0x7f + +# ATT: vscalefpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vscalefpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0x2c,0x72,0x80 + +# ATT: vsqrtnepbf16 %xmm23, %xmm22 +# INTEL: vsqrtnepbf16 xmm22, xmm23 +0x62,0xa5,0x7d,0x08,0x51,0xf7 + +# ATT: vsqrtnepbf16 %xmm23, %xmm22 {%k7} +# INTEL: vsqrtnepbf16 xmm22 {k7}, xmm23 +0x62,0xa5,0x7d,0x0f,0x51,0xf7 + +# ATT: vsqrtnepbf16 %xmm23, %xmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm22 {k7} {z}, xmm23 +0x62,0xa5,0x7d,0x8f,0x51,0xf7 + +# ATT: vsqrtnepbf16 %zmm23, %zmm22 +# INTEL: vsqrtnepbf16 zmm22, zmm23 +0x62,0xa5,0x7d,0x48,0x51,0xf7 + +# ATT: vsqrtnepbf16 %zmm23, %zmm22 {%k7} +# INTEL: vsqrtnepbf16 zmm22 {k7}, zmm23 +0x62,0xa5,0x7d,0x4f,0x51,0xf7 + +# ATT: vsqrtnepbf16 %zmm23, %zmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm22 {k7} {z}, zmm23 +0x62,0xa5,0x7d,0xcf,0x51,0xf7 + +# ATT: vsqrtnepbf16 %ymm23, %ymm22 +# INTEL: vsqrtnepbf16 ymm22, ymm23 +0x62,0xa5,0x7d,0x28,0x51,0xf7 + +# ATT: vsqrtnepbf16 %ymm23, %ymm22 {%k7} +# INTEL: vsqrtnepbf16 ymm22 {k7}, ymm23 +0x62,0xa5,0x7d,0x2f,0x51,0xf7 + +# ATT: vsqrtnepbf16 %ymm23, %ymm22 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm22 {k7} {z}, ymm23 +0x62,0xa5,0x7d,0xaf,0x51,0xf7 + +# ATT: vsqrtnepbf16 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vsqrtnepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x08,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vsqrtnepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x0f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%rip){1to8}, %xmm22 +# INTEL: vsqrtnepbf16 xmm22, word ptr [rip]{1to8} +0x62,0xe5,0x7d,0x18,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsqrtnepbf16 
-512(,%rbp,2), %xmm22 +# INTEL: vsqrtnepbf16 xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x7d,0x08,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vsqrtnepbf16 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe5,0x7d,0x8f,0x51,0x71,0x7f + +# ATT: vsqrtnepbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x7d,0x9f,0x51,0x72,0x80 + +# ATT: vsqrtnepbf16 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vsqrtnepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x28,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vsqrtnepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x2f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%rip){1to16}, %ymm22 +# INTEL: vsqrtnepbf16 ymm22, word ptr [rip]{1to16} +0x62,0xe5,0x7d,0x38,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsqrtnepbf16 -1024(,%rbp,2), %ymm22 +# INTEL: vsqrtnepbf16 ymm22, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x7d,0x28,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vsqrtnepbf16 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe5,0x7d,0xaf,0x51,0x71,0x7f + +# ATT: vsqrtnepbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x7d,0xbf,0x51,0x72,0x80 + +# ATT: vsqrtnepbf16 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vsqrtnepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x48,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vsqrtnepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x4f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%rip){1to32}, %zmm22 +# INTEL: vsqrtnepbf16 zmm22, word ptr [rip]{1to32} +0x62,0xe5,0x7d,0x58,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsqrtnepbf16 -2048(,%rbp,2), %zmm22 +# INTEL: vsqrtnepbf16 zmm22, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x7d,0x48,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vsqrtnepbf16 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe5,0x7d,0xcf,0x51,0x71,0x7f + +# ATT: vsqrtnepbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x7d,0xdf,0x51,0x72,0x80 + +# ATT: vsubnepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vsubnepbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x5c,0xf0 + +# ATT: vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vsubnepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x5c,0xf0 + +# ATT: vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vsubnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x5c,0xf0 + +# ATT: vsubnepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vsubnepbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x5c,0xf0 + +# ATT: vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vsubnepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x5c,0xf0 + +# ATT: vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vsubnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x5c,0xf0 + +# ATT: vsubnepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vsubnepbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x5c,0xf0 + +# ATT: vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vsubnepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x5c,0xf0 + +# ATT: vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vsubnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x5c,0xf0 + +# 
ATT: vsubnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vsubnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vsubnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vsubnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x5c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsubnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vsubnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x5c,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vsubnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vsubnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x5c,0x71,0x7f + +# ATT: vsubnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vsubnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x45,0xd7,0x5c,0x72,0x80 + +# ATT: vsubnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vsubnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vsubnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vsubnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x5c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsubnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vsubnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x5c,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vsubnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vsubnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x5c,0x71,0x7f + +# ATT: vsubnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vsubnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x5c,0x72,0x80 + +# ATT: vsubnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vsubnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vsubnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x07,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vsubnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x5c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsubnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vsubnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x5c,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vsubnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vsubnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x5c,0x71,0x7f + +# ATT: vsubnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vsubnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x5c,0x72,0x80 + diff --git a/llvm/test/MC/X86/avx10.2-bf16-32-att.s b/llvm/test/MC/X86/avx10.2-bf16-32-att.s new file mode 100644 index 0000000000000..9f62743177c9b --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-bf16-32-att.s @@ -0,0 +1,3014 @@ +// RUN: llvm-mc -triple i386 --show-encoding %s | FileCheck %s + +// CHECK: vaddnepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0xd4] + vaddnepbf16 
%ymm4, %ymm3, %ymm2 + +// CHECK: vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x58,0xd4] + vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x58,0xd4] + vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vaddnepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0xd4] + vaddnepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x58,0xd4] + vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x58,0xd4] + vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vaddnepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0xd4] + vaddnepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x58,0xd4] + vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x58,0xd4] + vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vaddnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vaddnepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x58,0x10] + vaddnepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vaddnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0x14,0x6d,0x00,0xf8,0xff,0xff] + vaddnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vaddnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x58,0x51,0x7f] + vaddnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x58,0x52,0x80] + vaddnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vaddnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vaddnepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x58,0x10] + vaddnepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vaddnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0x14,0x6d,0x00,0xfc,0xff,0xff] + vaddnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vaddnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x58,0x51,0x7f] + vaddnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x58,0x52,0x80] + vaddnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vaddnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: 
[0x62,0xf5,0x65,0x0f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vaddnepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x58,0x10] + vaddnepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vaddnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0x14,0x6d,0x00,0xfe,0xff,0xff] + vaddnepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vaddnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x58,0x51,0x7f] + vaddnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x58,0x52,0x80] + vaddnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcmppbf16 $123, %ymm4, %ymm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x28,0xc2,0xec,0x7b] + vcmppbf16 $123, %ymm4, %ymm3, %k5 + +// CHECK: vcmppbf16 $123, %ymm4, %ymm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0xec,0x7b] + vcmppbf16 $123, %ymm4, %ymm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, %xmm4, %xmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0xec,0x7b] + vcmppbf16 $123, %xmm4, %xmm3, %k5 + +// CHECK: vcmppbf16 $123, %xmm4, %xmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x0f,0xc2,0xec,0x7b] + vcmppbf16 $123, %xmm4, %xmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, %zmm4, %zmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0xec,0x7b] + vcmppbf16 $123, %zmm4, %zmm3, %k5 + +// CHECK: vcmppbf16 $123, %zmm4, %zmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0xec,0x7b] + vcmppbf16 $123, %zmm4, %zmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%esp,%esi,8), %zmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%esp,%esi,8), %zmm3, %k5 + +// CHECK: vcmppbf16 $123, 291(%edi,%eax,4), %zmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%edi,%eax,4), %zmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%eax){1to32}, %zmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x58,0xc2,0x28,0x7b] + vcmppbf16 $123, (%eax){1to32}, %zmm3, %k5 + +// CHECK: vcmppbf16 $123, -2048(,%ebp,2), %zmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vcmppbf16 $123, -2048(,%ebp,2), %zmm3, %k5 + +// CHECK: vcmppbf16 $123, 8128(%ecx), %zmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 8128(%ecx), %zmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%edx){1to32}, %zmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x5f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%edx){1to32}, %zmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%esp,%esi,8), %xmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%esp,%esi,8), %xmm3, %k5 + +// CHECK: vcmppbf16 $123, 291(%edi,%eax,4), %xmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x0f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%edi,%eax,4), %xmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%eax){1to8}, %xmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x18,0xc2,0x28,0x7b] + vcmppbf16 $123, (%eax){1to8}, %xmm3, %k5 + +// CHECK: vcmppbf16 $123, -512(,%ebp,2), %xmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vcmppbf16 $123, -512(,%ebp,2), %xmm3, %k5 + +// CHECK: vcmppbf16 $123, 2032(%ecx), %xmm3, %k5 {%k7} +// CHECK: 
encoding: [0x62,0xf3,0x67,0x0f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 2032(%ecx), %xmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%edx){1to8}, %xmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x1f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%edx){1to8}, %xmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%esp,%esi,8), %ymm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x28,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%esp,%esi,8), %ymm3, %k5 + +// CHECK: vcmppbf16 $123, 291(%edi,%eax,4), %ymm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%edi,%eax,4), %ymm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%eax){1to16}, %ymm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x38,0xc2,0x28,0x7b] + vcmppbf16 $123, (%eax){1to16}, %ymm3, %k5 + +// CHECK: vcmppbf16 $123, -1024(,%ebp,2), %ymm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x28,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vcmppbf16 $123, -1024(,%ebp,2), %ymm3, %k5 + +// CHECK: vcmppbf16 $123, 4064(%ecx), %ymm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 4064(%ecx), %ymm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%edx){1to16}, %ymm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x3f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%edx){1to16}, %ymm3, %k5 {%k7} + +// CHECK: vcomsbf16 %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xd3] + vcomsbf16 %xmm3, %xmm2 + +// CHECK: vcomsbf16 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10] + vcomsbf16 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vcomsbf16 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00] + vcomsbf16 291(%edi,%eax,4), %xmm2 + +// CHECK: vcomsbf16 (%eax), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x10] + vcomsbf16 (%eax), %xmm2 + +// CHECK: vcomsbf16 -64(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff] + vcomsbf16 -64(,%ebp,2), %xmm2 + +// CHECK: vcomsbf16 254(%ecx), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x51,0x7f] + vcomsbf16 254(%ecx), %xmm2 + +// CHECK: vcomsbf16 -256(%edx), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x52,0x80] + vcomsbf16 -256(%edx), %xmm2 + +// CHECK: vdivnepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0xd4] + vdivnepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5e,0xd4] + vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5e,0xd4] + vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vdivnepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0xd4] + vdivnepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5e,0xd4] + vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5e,0xd4] + vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vdivnepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0xd4] + vdivnepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5e,0xd4] + vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: 
[0x62,0xf5,0x65,0x8f,0x5e,0xd4] + vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + vdivnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vdivnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vdivnepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5e,0x10] + vdivnepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vdivnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vdivnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vdivnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5e,0x51,0x7f] + vdivnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5e,0x52,0x80] + vdivnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + vdivnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vdivnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vdivnepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5e,0x10] + vdivnepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vdivnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vdivnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vdivnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5e,0x51,0x7f] + vdivnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5e,0x52,0x80] + vdivnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + vdivnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vdivnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vdivnepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5e,0x10] + vdivnepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vdivnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vdivnepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vdivnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5e,0x51,0x7f] + vdivnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5e,0x52,0x80] + vdivnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0xd4] + vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x98,0xd4] + vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd132nepbf16 
%ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x98,0xd4] + vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0xd4] + vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x98,0xd4] + vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x98,0xd4] + vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0xd4] + vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x98,0xd4] + vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x98,0xd4] + vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x98,0x10] + vfmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x98,0x51,0x7f] + vfmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x98,0x52,0x80] + vfmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x98,0x10] + vfmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x98,0x51,0x7f] + vfmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x98,0x52,0x80] + vfmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmadd132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: 
encoding: [0x62,0xf6,0x64,0x0f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x98,0x10] + vfmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x98,0x51,0x7f] + vfmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x98,0x52,0x80] + vfmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0xd4] + vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xa8,0xd4] + vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xa8,0xd4] + vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0xd4] + vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xa8,0xd4] + vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xa8,0xd4] + vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0xd4] + vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xa8,0xd4] + vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xa8,0xd4] + vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xa8,0x10] + vfmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xa8,0x51,0x7f] + vfmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xa8,0x52,0x80] + vfmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmadd213nepbf16 
291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xa8,0x10] + vfmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xa8,0x51,0x7f] + vfmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xa8,0x52,0x80] + vfmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xa8,0x10] + vfmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xa8,0x51,0x7f] + vfmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xa8,0x52,0x80] + vfmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0xd4] + vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xb8,0xd4] + vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xb8,0xd4] + vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0xd4] + vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xb8,0xd4] + vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xb8,0xd4] + vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0xd4] + vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xb8,0xd4] + vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xb8,0xd4] + vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%esp,%esi,8), 
%zmm3, %zmm2 + +// CHECK: vfmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xb8,0x10] + vfmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xb8,0x51,0x7f] + vfmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xb8,0x52,0x80] + vfmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xb8,0x10] + vfmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmadd231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xb8,0x51,0x7f] + vfmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xb8,0x52,0x80] + vfmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xb8,0x10] + vfmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xb8,0x51,0x7f] + vfmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xb8,0x52,0x80] + vfmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0xd4] + vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9a,0xd4] + vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9a,0xd4] 
+ vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0xd4] + vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9a,0xd4] + vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9a,0xd4] + vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0xd4] + vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9a,0xd4] + vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9a,0xd4] + vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9a,0x10] + vfmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9a,0x51,0x7f] + vfmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9a,0x52,0x80] + vfmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9a,0x10] + vfmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9a,0x51,0x7f] + vfmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9a,0x52,0x80] + vfmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + 
vfmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9a,0x10] + vfmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9a,0x51,0x7f] + vfmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9a,0x52,0x80] + vfmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0xd4] + vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xaa,0xd4] + vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xaa,0xd4] + vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0xd4] + vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xaa,0xd4] + vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xaa,0xd4] + vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0xd4] + vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xaa,0xd4] + vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xaa,0xd4] + vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xaa,0x10] + vfmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xaa,0x51,0x7f] + vfmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xaa,0x52,0x80] + vfmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x64,0x2f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xaa,0x10] + vfmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xaa,0x51,0x7f] + vfmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xaa,0x52,0x80] + vfmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xaa,0x10] + vfmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xaa,0x51,0x7f] + vfmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xaa,0x52,0x80] + vfmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xba,0xd4] + vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xba,0xd4] + vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xba,0xd4] + vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0xd4] + vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xba,0xd4] + vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xba,0xd4] + vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0xd4] + vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xba,0xd4] + vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xba,0xd4] + vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfmsub231nepbf16 291(%edi,%eax,4), 
%zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xba,0x10] + vfmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xba,0x51,0x7f] + vfmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xba,0x52,0x80] + vfmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xba,0x10] + vfmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xba,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xba,0x51,0x7f] + vfmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xba,0x52,0x80] + vfmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xba,0x10] + vfmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xba,0x51,0x7f] + vfmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xba,0x52,0x80] + vfmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0xd4] + vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9c,0xd4] + vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9c,0xd4] + vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// 
CHECK: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0xd4] + vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9c,0xd4] + vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9c,0xd4] + vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0xd4] + vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9c,0xd4] + vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9c,0xd4] + vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfnmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9c,0x10] + vfnmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9c,0x51,0x7f] + vfnmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9c,0x52,0x80] + vfnmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfnmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9c,0x10] + vfnmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9c,0x51,0x7f] + vfnmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9c,0x52,0x80] + vfnmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmadd132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 
291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9c,0x10] + vfnmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9c,0x51,0x7f] + vfnmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9c,0x52,0x80] + vfnmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0xd4] + vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xac,0xd4] + vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xac,0xd4] + vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0xd4] + vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xac,0xd4] + vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xac,0xd4] + vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0xd4] + vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xac,0xd4] + vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xac,0xd4] + vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfnmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xac,0x10] + vfnmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xac,0x51,0x7f] + vfnmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xac,0x52,0x80] + vfnmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfnmadd213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: 
encoding: [0x62,0xf6,0x64,0x2f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xac,0x10] + vfnmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xac,0x51,0x7f] + vfnmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xac,0x52,0x80] + vfnmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xac,0x10] + vfnmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xac,0x51,0x7f] + vfnmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xac,0x52,0x80] + vfnmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbc,0xd4] + vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbc,0xd4] + vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbc,0xd4] + vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0xd4] + vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbc,0xd4] + vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbc,0xd4] + vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0xd4] + vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbc,0xd4] + vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbc,0xd4] + vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%esp,%esi,8), %zmm3, 
%zmm2 + +// CHECK: vfnmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xbc,0x10] + vfnmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbc,0x51,0x7f] + vfnmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xbc,0x52,0x80] + vfnmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfnmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xbc,0x10] + vfnmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmadd231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbc,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbc,0x51,0x7f] + vfnmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xbc,0x52,0x80] + vfnmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xbc,0x10] + vfnmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbc,0x51,0x7f] + vfnmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xbc,0x52,0x80] + vfnmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0xd4] + vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9e,0xd4] + vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: 
[0x62,0xf6,0x64,0xaf,0x9e,0xd4] + vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0xd4] + vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9e,0xd4] + vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9e,0xd4] + vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0xd4] + vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9e,0xd4] + vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9e,0xd4] + vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfnmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9e,0x10] + vfnmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9e,0x51,0x7f] + vfnmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9e,0x52,0x80] + vfnmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfnmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9e,0x10] + vfnmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9e,0x51,0x7f] + vfnmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9e,0x52,0x80] + vfnmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x64,0x0f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9e,0x10] + vfnmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9e,0x51,0x7f] + vfnmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9e,0x52,0x80] + vfnmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0xd4] + vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xae,0xd4] + vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xae,0xd4] + vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0xd4] + vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xae,0xd4] + vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xae,0xd4] + vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0xd4] + vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xae,0xd4] + vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xae,0xd4] + vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfnmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xae,0x10] + vfnmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xae,0x51,0x7f] + vfnmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xae,0x52,0x80] + vfnmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// 
CHECK: vfnmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xae,0x10] + vfnmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xae,0x51,0x7f] + vfnmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xae,0x52,0x80] + vfnmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xae,0x10] + vfnmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xae,0x51,0x7f] + vfnmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xae,0x52,0x80] + vfnmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0xd4] + vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbe,0xd4] + vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbe,0xd4] + vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbe,0xd4] + vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbe,0xd4] + vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbe,0xd4] + vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0xd4] + vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbe,0xd4] + vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbe,0xd4] + vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: 
[0x62,0xf6,0x64,0x48,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfnmsub231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xbe,0x10] + vfnmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbe,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbe,0x51,0x7f] + vfnmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xbe,0x52,0x80] + vfnmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfnmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xbe,0x10] + vfnmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbe,0x51,0x7f] + vfnmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xbe,0x52,0x80] + vfnmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xbe,0x10] + vfnmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbe,0x51,0x7f] + vfnmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xbe,0x52,0x80] + vfnmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfpclasspbf16 $123, %zmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %zmm3, %k5 + +// CHECK: vfpclasspbf16 $123, %zmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %zmm3, 
%k5 {%k7} + +// CHECK: vfpclasspbf16 $123, %ymm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %ymm3, %k5 + +// CHECK: vfpclasspbf16 $123, %ymm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %ymm3, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, %xmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %xmm3, %k5 + +// CHECK: vfpclasspbf16 $123, %xmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %xmm3, %k5 {%k7} + +// CHECK: vfpclasspbf16x $123, 268435456(%esp,%esi,8), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vfpclasspbf16x $123, 268435456(%esp,%esi,8), %k5 + +// CHECK: vfpclasspbf16x $123, 291(%edi,%eax,4), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vfpclasspbf16x $123, 291(%edi,%eax,4), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%eax){1to8}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x66,0x28,0x7b] + vfpclasspbf16 $123, (%eax){1to8}, %k5 + +// CHECK: vfpclasspbf16x $123, -512(,%ebp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vfpclasspbf16x $123, -512(,%ebp,2), %k5 + +// CHECK: vfpclasspbf16x $123, 2032(%ecx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16x $123, 2032(%ecx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, -256(%edx){1to8}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%edx){1to8}, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%eax){1to16}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x66,0x28,0x7b] + vfpclasspbf16 $123, (%eax){1to16}, %k5 + +// CHECK: vfpclasspbf16y $123, -1024(,%ebp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vfpclasspbf16y $123, -1024(,%ebp,2), %k5 + +// CHECK: vfpclasspbf16y $123, 4064(%ecx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16y $123, 4064(%ecx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, -256(%edx){1to16}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%edx){1to16}, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%eax){1to32}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x66,0x28,0x7b] + vfpclasspbf16 $123, (%eax){1to32}, %k5 + +// CHECK: vfpclasspbf16z $123, -2048(,%ebp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vfpclasspbf16z $123, -2048(,%ebp,2), %k5 + +// CHECK: vfpclasspbf16z $123, 8128(%ecx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16z $123, 8128(%ecx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, -256(%edx){1to32}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%edx){1to32}, %k5 {%k7} + +// CHECK: vgetexppbf16 %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0xd3] + vgetexppbf16 %xmm3, %xmm2 + +// CHECK: vgetexppbf16 %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0xd3] + vgetexppbf16 %xmm3, %xmm2 {%k7} + +// CHECK: vgetexppbf16 %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0xd3] + vgetexppbf16 %xmm3, %xmm2 {%k7} {z} + +// CHECK: vgetexppbf16 %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0xd3] + vgetexppbf16 %zmm3, %zmm2 + +// CHECK: vgetexppbf16 %zmm3, %zmm2 {%k7} +// CHECK: encoding: 
[0x62,0xf5,0x7d,0x4f,0x42,0xd3] + vgetexppbf16 %zmm3, %zmm2 {%k7} + +// CHECK: vgetexppbf16 %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0xd3] + vgetexppbf16 %zmm3, %zmm2 {%k7} {z} + +// CHECK: vgetexppbf16 %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0xd3] + vgetexppbf16 %ymm3, %ymm2 + +// CHECK: vgetexppbf16 %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0xd3] + vgetexppbf16 %ymm3, %ymm2 {%k7} + +// CHECK: vgetexppbf16 %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0xd3] + vgetexppbf16 %ymm3, %ymm2 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vgetexppbf16 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vgetexppbf16 (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x42,0x10] + vgetexppbf16 (%eax){1to8}, %xmm2 + +// CHECK: vgetexppbf16 -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff] + vgetexppbf16 -512(,%ebp,2), %xmm2 + +// CHECK: vgetexppbf16 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0x51,0x7f] + vgetexppbf16 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x42,0x52,0x80] + vgetexppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vgetexppbf16 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vgetexppbf16 (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x42,0x10] + vgetexppbf16 (%eax){1to16}, %ymm2 + +// CHECK: vgetexppbf16 -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff] + vgetexppbf16 -1024(,%ebp,2), %ymm2 + +// CHECK: vgetexppbf16 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0x51,0x7f] + vgetexppbf16 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x42,0x52,0x80] + vgetexppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vgetexppbf16 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vgetexppbf16 (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x42,0x10] + vgetexppbf16 (%eax){1to32}, %zmm2 + +// CHECK: vgetexppbf16 -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff] + vgetexppbf16 -2048(,%ebp,2), %zmm2 + +// CHECK: vgetexppbf16 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0x51,0x7f] + vgetexppbf16 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x42,0x52,0x80] + vgetexppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: 
vgetmantpbf16 $123, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %zmm3, %zmm2 + +// CHECK: vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} + +// CHECK: vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %ymm3, %ymm2 + +// CHECK: vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} + +// CHECK: vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %xmm3, %xmm2 + +// CHECK: vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} + +// CHECK: vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vgetmantpbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vgetmantpbf16 $123, (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x26,0x10,0x7b] + vgetmantpbf16 $123, (%eax){1to8}, %xmm2 + +// CHECK: vgetmantpbf16 $123, -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vgetmantpbf16 $123, -512(,%ebp,2), %xmm2 + +// CHECK: vgetmantpbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x26,0x52,0x80,0x7b] + vgetmantpbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vgetmantpbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vgetmantpbf16 $123, (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x26,0x10,0x7b] + vgetmantpbf16 $123, (%eax){1to16}, %ymm2 + +// CHECK: vgetmantpbf16 $123, -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vgetmantpbf16 $123, -1024(,%ebp,2), %ymm2 + +// CHECK: vgetmantpbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x26,0x52,0x80,0x7b] + vgetmantpbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 
268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vgetmantpbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vgetmantpbf16 $123, (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x26,0x10,0x7b] + vgetmantpbf16 $123, (%eax){1to32}, %zmm2 + +// CHECK: vgetmantpbf16 $123, -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vgetmantpbf16 $123, -2048(,%ebp,2), %zmm2 + +// CHECK: vgetmantpbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x26,0x52,0x80,0x7b] + vgetmantpbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vmaxpbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0xd4] + vmaxpbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5f,0xd4] + vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5f,0xd4] + vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmaxpbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0xd4] + vmaxpbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5f,0xd4] + vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5f,0xd4] + vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmaxpbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0xd4] + vmaxpbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5f,0xd4] + vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5f,0xd4] + vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vmaxpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vmaxpbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5f,0x10] + vmaxpbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vmaxpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0x14,0x6d,0x00,0xf8,0xff,0xff] + vmaxpbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vmaxpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5f,0x51,0x7f] + vmaxpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5f,0x52,0x80] + vmaxpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vmaxpbf16 
291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vmaxpbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5f,0x10] + vmaxpbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vmaxpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0x14,0x6d,0x00,0xfc,0xff,0xff] + vmaxpbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vmaxpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5f,0x51,0x7f] + vmaxpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5f,0x52,0x80] + vmaxpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vmaxpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vmaxpbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5f,0x10] + vmaxpbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vmaxpbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0x14,0x6d,0x00,0xfe,0xff,0xff] + vmaxpbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vmaxpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5f,0x51,0x7f] + vmaxpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5f,0x52,0x80] + vmaxpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vminpbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0xd4] + vminpbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5d,0xd4] + vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5d,0xd4] + vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vminpbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0xd4] + vminpbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5d,0xd4] + vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5d,0xd4] + vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vminpbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0xd4] + vminpbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5d,0xd4] + vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5d,0xd4] + vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vminpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vminpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vminpbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: 
[0x62,0xf5,0x65,0x58,0x5d,0x10] + vminpbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vminpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0x14,0x6d,0x00,0xf8,0xff,0xff] + vminpbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vminpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5d,0x51,0x7f] + vminpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vminpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5d,0x52,0x80] + vminpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vminpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vminpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vminpbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5d,0x10] + vminpbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vminpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0x14,0x6d,0x00,0xfc,0xff,0xff] + vminpbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vminpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5d,0x51,0x7f] + vminpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vminpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5d,0x52,0x80] + vminpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vminpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vminpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vminpbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5d,0x10] + vminpbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vminpbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0x14,0x6d,0x00,0xfe,0xff,0xff] + vminpbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vminpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5d,0x51,0x7f] + vminpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vminpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5d,0x52,0x80] + vminpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vmulnepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x59,0xd4] + vmulnepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x59,0xd4] + vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x59,0xd4] + vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmulnepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0xd4] + vmulnepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x59,0xd4] + vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x59,0xd4] + vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmulnepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: 
encoding: [0x62,0xf5,0x65,0x08,0x59,0xd4] + vmulnepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x59,0xd4] + vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x59,0xd4] + vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vmulnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vmulnepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x59,0x10] + vmulnepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vmulnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0x14,0x6d,0x00,0xf8,0xff,0xff] + vmulnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vmulnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x59,0x51,0x7f] + vmulnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x59,0x52,0x80] + vmulnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vmulnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vmulnepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x59,0x10] + vmulnepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vmulnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x59,0x14,0x6d,0x00,0xfc,0xff,0xff] + vmulnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vmulnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x59,0x51,0x7f] + vmulnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x59,0x52,0x80] + vmulnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vmulnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vmulnepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x59,0x10] + vmulnepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vmulnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x59,0x14,0x6d,0x00,0xfe,0xff,0xff] + vmulnepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vmulnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x59,0x51,0x7f] + vmulnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x59,0x52,0x80] + vmulnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vrcppbf16 %xmm3, 
%xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0xd3] + vrcppbf16 %xmm3, %xmm2 + +// CHECK: vrcppbf16 %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4c,0xd3] + vrcppbf16 %xmm3, %xmm2 {%k7} + +// CHECK: vrcppbf16 %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4c,0xd3] + vrcppbf16 %xmm3, %xmm2 {%k7} {z} + +// CHECK: vrcppbf16 %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4c,0xd3] + vrcppbf16 %zmm3, %zmm2 + +// CHECK: vrcppbf16 %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4c,0xd3] + vrcppbf16 %zmm3, %zmm2 {%k7} + +// CHECK: vrcppbf16 %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4c,0xd3] + vrcppbf16 %zmm3, %zmm2 {%k7} {z} + +// CHECK: vrcppbf16 %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0xd3] + vrcppbf16 %ymm3, %ymm2 + +// CHECK: vrcppbf16 %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4c,0xd3] + vrcppbf16 %ymm3, %ymm2 {%k7} + +// CHECK: vrcppbf16 %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4c,0xd3] + vrcppbf16 %ymm3, %ymm2 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vrcppbf16 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vrcppbf16 (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x18,0x4c,0x10] + vrcppbf16 (%eax){1to8}, %xmm2 + +// CHECK: vrcppbf16 -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vrcppbf16 -512(,%ebp,2), %xmm2 + +// CHECK: vrcppbf16 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4c,0x51,0x7f] + vrcppbf16 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vrcppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x9f,0x4c,0x52,0x80] + vrcppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vrcppbf16 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vrcppbf16 (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x38,0x4c,0x10] + vrcppbf16 (%eax){1to16}, %ymm2 + +// CHECK: vrcppbf16 -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vrcppbf16 -1024(,%ebp,2), %ymm2 + +// CHECK: vrcppbf16 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4c,0x51,0x7f] + vrcppbf16 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vrcppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xbf,0x4c,0x52,0x80] + vrcppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vrcppbf16 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vrcppbf16 (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x58,0x4c,0x10] + vrcppbf16 (%eax){1to32}, %zmm2 + +// CHECK: vrcppbf16 -2048(,%ebp,2), %zmm2 +// CHECK: encoding: 
[0x62,0xf6,0x7c,0x48,0x4c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vrcppbf16 -2048(,%ebp,2), %zmm2 + +// CHECK: vrcppbf16 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4c,0x51,0x7f] + vrcppbf16 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vrcppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xdf,0x4c,0x52,0x80] + vrcppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0xd3,0x7b] + vreducenepbf16 $123, %zmm3, %zmm2 + +// CHECK: vreducenepbf16 $123, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x56,0xd3,0x7b] + vreducenepbf16 $123, %zmm3, %zmm2 {%k7} + +// CHECK: vreducenepbf16 $123, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x56,0xd3,0x7b] + vreducenepbf16 $123, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x56,0xd3,0x7b] + vreducenepbf16 $123, %ymm3, %ymm2 + +// CHECK: vreducenepbf16 $123, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x56,0xd3,0x7b] + vreducenepbf16 $123, %ymm3, %ymm2 {%k7} + +// CHECK: vreducenepbf16 $123, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x56,0xd3,0x7b] + vreducenepbf16 $123, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0xd3,0x7b] + vreducenepbf16 $123, %xmm3, %xmm2 + +// CHECK: vreducenepbf16 $123, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x56,0xd3,0x7b] + vreducenepbf16 $123, %xmm3, %xmm2 {%k7} + +// CHECK: vreducenepbf16 $123, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x56,0xd3,0x7b] + vreducenepbf16 $123, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vreducenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vreducenepbf16 $123, (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x56,0x10,0x7b] + vreducenepbf16 $123, (%eax){1to8}, %xmm2 + +// CHECK: vreducenepbf16 $123, -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vreducenepbf16 $123, -512(,%ebp,2), %xmm2 + +// CHECK: vreducenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x56,0x51,0x7f,0x7b] + vreducenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x56,0x52,0x80,0x7b] + vreducenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vreducenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vreducenepbf16 $123, (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x56,0x10,0x7b] + vreducenepbf16 $123, (%eax){1to16}, %ymm2 + +// CHECK: vreducenepbf16 $123, -1024(,%ebp,2), %ymm2 +// CHECK: encoding: 
[0x62,0xf3,0x7f,0x28,0x56,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vreducenepbf16 $123, -1024(,%ebp,2), %ymm2 + +// CHECK: vreducenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x56,0x51,0x7f,0x7b] + vreducenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x56,0x52,0x80,0x7b] + vreducenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vreducenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vreducenepbf16 $123, (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x56,0x10,0x7b] + vreducenepbf16 $123, (%eax){1to32}, %zmm2 + +// CHECK: vreducenepbf16 $123, -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vreducenepbf16 $123, -2048(,%ebp,2), %zmm2 + +// CHECK: vreducenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x56,0x51,0x7f,0x7b] + vreducenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x56,0x52,0x80,0x7b] + vreducenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %zmm3, %zmm2 + +// CHECK: vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %ymm3, %ymm2 + +// CHECK: vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %xmm3, %xmm2 + +// CHECK: vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vrndscalenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x08,0x10,0x7b] + vrndscalenepbf16 $123, (%eax){1to8}, %xmm2 + +// 
CHECK: vrndscalenepbf16 $123, -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -512(,%ebp,2), %xmm2 + +// CHECK: vrndscalenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vrndscalenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x08,0x10,0x7b] + vrndscalenepbf16 $123, (%eax){1to16}, %ymm2 + +// CHECK: vrndscalenepbf16 $123, -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -1024(,%ebp,2), %ymm2 + +// CHECK: vrndscalenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vrndscalenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x08,0x10,0x7b] + vrndscalenepbf16 $123, (%eax){1to32}, %zmm2 + +// CHECK: vrndscalenepbf16 $123, -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -2048(,%ebp,2), %zmm2 + +// CHECK: vrndscalenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0xd3] + vrsqrtpbf16 %xmm3, %xmm2 + +// CHECK: vrsqrtpbf16 %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4e,0xd3] + vrsqrtpbf16 %xmm3, %xmm2 {%k7} + +// CHECK: vrsqrtpbf16 %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4e,0xd3] + vrsqrtpbf16 %xmm3, %xmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0xd3] + vrsqrtpbf16 %zmm3, %zmm2 + +// CHECK: vrsqrtpbf16 %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4e,0xd3] + vrsqrtpbf16 %zmm3, %zmm2 {%k7} + +// CHECK: vrsqrtpbf16 %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4e,0xd3] + 
vrsqrtpbf16 %zmm3, %zmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0xd3] + vrsqrtpbf16 %ymm3, %ymm2 + +// CHECK: vrsqrtpbf16 %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4e,0xd3] + vrsqrtpbf16 %ymm3, %ymm2 {%k7} + +// CHECK: vrsqrtpbf16 %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4e,0xd3] + vrsqrtpbf16 %ymm3, %ymm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vrsqrtpbf16 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vrsqrtpbf16 (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x18,0x4e,0x10] + vrsqrtpbf16 (%eax){1to8}, %xmm2 + +// CHECK: vrsqrtpbf16 -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vrsqrtpbf16 -512(,%ebp,2), %xmm2 + +// CHECK: vrsqrtpbf16 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4e,0x51,0x7f] + vrsqrtpbf16 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x9f,0x4e,0x52,0x80] + vrsqrtpbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vrsqrtpbf16 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vrsqrtpbf16 (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x38,0x4e,0x10] + vrsqrtpbf16 (%eax){1to16}, %ymm2 + +// CHECK: vrsqrtpbf16 -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vrsqrtpbf16 -1024(,%ebp,2), %ymm2 + +// CHECK: vrsqrtpbf16 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4e,0x51,0x7f] + vrsqrtpbf16 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xbf,0x4e,0x52,0x80] + vrsqrtpbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vrsqrtpbf16 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vrsqrtpbf16 (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x58,0x4e,0x10] + vrsqrtpbf16 (%eax){1to32}, %zmm2 + +// CHECK: vrsqrtpbf16 -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vrsqrtpbf16 -2048(,%ebp,2), %zmm2 + +// CHECK: vrsqrtpbf16 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4e,0x51,0x7f] + vrsqrtpbf16 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xdf,0x4e,0x52,0x80] + vrsqrtpbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vscalefpbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0xd4] + vscalefpbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x64,0x2f,0x2c,0xd4] + vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x2c,0xd4] + vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vscalefpbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0xd4] + vscalefpbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x2c,0xd4] + vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x2c,0xd4] + vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vscalefpbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0xd4] + vscalefpbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x2c,0xd4] + vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x2c,0xd4] + vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vscalefpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vscalefpbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x2c,0x10] + vscalefpbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vscalefpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vscalefpbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vscalefpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x2c,0x51,0x7f] + vscalefpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x2c,0x52,0x80] + vscalefpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vscalefpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vscalefpbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x2c,0x10] + vscalefpbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vscalefpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vscalefpbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vscalefpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x2c,0x51,0x7f] + vscalefpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x2c,0x52,0x80] + vscalefpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vscalefpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x64,0x0f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vscalefpbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x2c,0x10] + vscalefpbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vscalefpbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vscalefpbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vscalefpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x2c,0x51,0x7f] + vscalefpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x2c,0x52,0x80] + vscalefpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0xd3] + vsqrtnepbf16 %xmm3, %xmm2 + +// CHECK: vsqrtnepbf16 %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x51,0xd3] + vsqrtnepbf16 %xmm3, %xmm2 {%k7} + +// CHECK: vsqrtnepbf16 %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x51,0xd3] + vsqrtnepbf16 %xmm3, %xmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0xd3] + vsqrtnepbf16 %zmm3, %zmm2 + +// CHECK: vsqrtnepbf16 %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x51,0xd3] + vsqrtnepbf16 %zmm3, %zmm2 {%k7} + +// CHECK: vsqrtnepbf16 %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x51,0xd3] + vsqrtnepbf16 %zmm3, %zmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0xd3] + vsqrtnepbf16 %ymm3, %ymm2 + +// CHECK: vsqrtnepbf16 %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x51,0xd3] + vsqrtnepbf16 %ymm3, %ymm2 {%k7} + +// CHECK: vsqrtnepbf16 %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x51,0xd3] + vsqrtnepbf16 %ymm3, %ymm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vsqrtnepbf16 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vsqrtnepbf16 (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x51,0x10] + vsqrtnepbf16 (%eax){1to8}, %xmm2 + +// CHECK: vsqrtnepbf16 -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vsqrtnepbf16 -512(,%ebp,2), %xmm2 + +// CHECK: vsqrtnepbf16 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x51,0x51,0x7f] + vsqrtnepbf16 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x51,0x52,0x80] + vsqrtnepbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vsqrtnepbf16 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vsqrtnepbf16 (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x51,0x10] + vsqrtnepbf16 (%eax){1to16}, %ymm2 + +// CHECK: vsqrtnepbf16 -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + 
vsqrtnepbf16 -1024(,%ebp,2), %ymm2 + +// CHECK: vsqrtnepbf16 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x51,0x51,0x7f] + vsqrtnepbf16 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x51,0x52,0x80] + vsqrtnepbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vsqrtnepbf16 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vsqrtnepbf16 (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x51,0x10] + vsqrtnepbf16 (%eax){1to32}, %zmm2 + +// CHECK: vsqrtnepbf16 -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff] + vsqrtnepbf16 -2048(,%ebp,2), %zmm2 + +// CHECK: vsqrtnepbf16 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x51,0x51,0x7f] + vsqrtnepbf16 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x51,0x52,0x80] + vsqrtnepbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vsubnepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0xd4] + vsubnepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5c,0xd4] + vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5c,0xd4] + vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vsubnepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0xd4] + vsubnepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5c,0xd4] + vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5c,0xd4] + vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vsubnepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0xd4] + vsubnepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5c,0xd4] + vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5c,0xd4] + vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vsubnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10] + vsubnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vsubnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00] + vsubnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vsubnepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5c,0x10] + vsubnepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vsubnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vsubnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vsubnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5c,0x51,0x7f] + vsubnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vsubnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 
{%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5c,0x52,0x80]
+ vsubnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z}
+
+// CHECK: vsubnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vsubnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vsubnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00]
+ vsubnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+
+// CHECK: vsubnepbf16 (%eax){1to16}, %ymm3, %ymm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5c,0x10]
+ vsubnepbf16 (%eax){1to16}, %ymm3, %ymm2
+
+// CHECK: vsubnepbf16 -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vsubnepbf16 -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vsubnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5c,0x51,0x7f]
+ vsubnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+
+// CHECK: vsubnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5c,0x52,0x80]
+ vsubnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z}
+
+// CHECK: vsubnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vsubnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vsubnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00]
+ vsubnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+
+// CHECK: vsubnepbf16 (%eax){1to8}, %xmm3, %xmm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5c,0x10]
+ vsubnepbf16 (%eax){1to8}, %xmm3, %xmm2
+
+// CHECK: vsubnepbf16 -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vsubnepbf16 -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vsubnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5c,0x51,0x7f]
+ vsubnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+
+// CHECK: vsubnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5c,0x52,0x80]
+ vsubnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z}
+
diff --git a/llvm/test/MC/X86/avx10.2-bf16-32-intel.s b/llvm/test/MC/X86/avx10.2-bf16-32-intel.s
new file mode 100644
index 0000000000000..30c2cf45297bc
--- /dev/null
+++ b/llvm/test/MC/X86/avx10.2-bf16-32-intel.s
@@ -0,0 +1,3014 @@
+// RUN: llvm-mc -triple i386 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: vaddnepbf16 ymm2, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0xd4]
+ vaddnepbf16 ymm2, ymm3, ymm4
+
+// CHECK: vaddnepbf16 ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x58,0xd4]
+ vaddnepbf16 ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vaddnepbf16 ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x58,0xd4]
+ vaddnepbf16 ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vaddnepbf16 zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0xd4]
+ vaddnepbf16 zmm2, zmm3, zmm4
+
+// CHECK: vaddnepbf16 zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x58,0xd4]
+ vaddnepbf16 zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vaddnepbf16 zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x58,0xd4]
+ vaddnepbf16 zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vaddnepbf16 xmm2, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0xd4]
+ vaddnepbf16 xmm2, xmm3, xmm4
+
+// CHECK: vaddnepbf16 xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x58,0xd4] + vaddnepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vaddnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x58,0xd4] + vaddnepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vaddnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vaddnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vaddnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x58,0x10] + vaddnepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vaddnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0x14,0x6d,0x00,0xf8,0xff,0xff] + vaddnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vaddnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x58,0x51,0x7f] + vaddnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vaddnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x58,0x52,0x80] + vaddnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vaddnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vaddnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vaddnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x58,0x10] + vaddnepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vaddnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0x14,0x6d,0x00,0xfc,0xff,0xff] + vaddnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vaddnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x58,0x51,0x7f] + vaddnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vaddnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x58,0x52,0x80] + vaddnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vaddnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vaddnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vaddnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x58,0x10] + vaddnepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vaddnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0x14,0x6d,0x00,0xfe,0xff,0xff] + vaddnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vaddnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x58,0x51,0x7f] + vaddnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// 
CHECK: vaddnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x58,0x52,0x80] + vaddnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vcmppbf16 k5, ymm3, ymm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x28,0xc2,0xec,0x7b] + vcmppbf16 k5, ymm3, ymm4, 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm3, ymm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0xec,0x7b] + vcmppbf16 k5 {k7}, ymm3, ymm4, 123 + +// CHECK: vcmppbf16 k5, xmm3, xmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0xec,0x7b] + vcmppbf16 k5, xmm3, xmm4, 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm3, xmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x0f,0xc2,0xec,0x7b] + vcmppbf16 k5 {k7}, xmm3, xmm4, 123 + +// CHECK: vcmppbf16 k5, zmm3, zmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0xec,0x7b] + vcmppbf16 k5, zmm3, zmm4, 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm3, zmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0xec,0x7b] + vcmppbf16 k5 {k7}, zmm3, zmm4, 123 + +// CHECK: vcmppbf16 k5, zmm3, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, zmm3, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vcmppbf16 k5, zmm3, word ptr [eax]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x58,0xc2,0x28,0x7b] + vcmppbf16 k5, zmm3, word ptr [eax]{1to32}, 123 + +// CHECK: vcmppbf16 k5, zmm3, zmmword ptr [2*ebp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vcmppbf16 k5, zmm3, zmmword ptr [2*ebp - 2048], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm3, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, zmm3, zmmword ptr [ecx + 8128], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm3, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x5f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, zmm3, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vcmppbf16 k5, xmm3, xmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, xmm3, xmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x0f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vcmppbf16 k5, xmm3, word ptr [eax]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x18,0xc2,0x28,0x7b] + vcmppbf16 k5, xmm3, word ptr [eax]{1to8}, 123 + +// CHECK: vcmppbf16 k5, xmm3, xmmword ptr [2*ebp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vcmppbf16 k5, xmm3, xmmword ptr [2*ebp - 512], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm3, xmmword ptr [ecx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x0f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, xmm3, xmmword ptr [ecx + 2032], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm3, word ptr [edx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x1f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, xmm3, word ptr [edx - 256]{1to8}, 123 + +// CHECK: vcmppbf16 k5, ymm3, ymmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: 
[0x62,0xf3,0x67,0x28,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, ymm3, ymmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vcmppbf16 k5, ymm3, word ptr [eax]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x38,0xc2,0x28,0x7b] + vcmppbf16 k5, ymm3, word ptr [eax]{1to16}, 123 + +// CHECK: vcmppbf16 k5, ymm3, ymmword ptr [2*ebp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x28,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vcmppbf16 k5, ymm3, ymmword ptr [2*ebp - 1024], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm3, ymmword ptr [ecx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, ymm3, ymmword ptr [ecx + 4064], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm3, word ptr [edx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x3f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, ymm3, word ptr [edx - 256]{1to16}, 123 + +// CHECK: vcomsbf16 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xd3] + vcomsbf16 xmm2, xmm3 + +// CHECK: vcomsbf16 xmm2, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10] + vcomsbf16 xmm2, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcomsbf16 xmm2, word ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00] + vcomsbf16 xmm2, word ptr [edi + 4*eax + 291] + +// CHECK: vcomsbf16 xmm2, word ptr [eax] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x10] + vcomsbf16 xmm2, word ptr [eax] + +// CHECK: vcomsbf16 xmm2, word ptr [2*ebp - 64] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff] + vcomsbf16 xmm2, word ptr [2*ebp - 64] + +// CHECK: vcomsbf16 xmm2, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x51,0x7f] + vcomsbf16 xmm2, word ptr [ecx + 254] + +// CHECK: vcomsbf16 xmm2, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x52,0x80] + vcomsbf16 xmm2, word ptr [edx - 256] + +// CHECK: vdivnepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0xd4] + vdivnepbf16 ymm2, ymm3, ymm4 + +// CHECK: vdivnepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5e,0xd4] + vdivnepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vdivnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5e,0xd4] + vdivnepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vdivnepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0xd4] + vdivnepbf16 zmm2, zmm3, zmm4 + +// CHECK: vdivnepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5e,0xd4] + vdivnepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vdivnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5e,0xd4] + vdivnepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vdivnepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0xd4] + vdivnepbf16 xmm2, xmm3, xmm4 + +// CHECK: vdivnepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5e,0xd4] + vdivnepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vdivnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5e,0xd4] + vdivnepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vdivnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + 
vdivnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdivnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vdivnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5e,0x10] + vdivnepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vdivnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vdivnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vdivnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5e,0x51,0x7f] + vdivnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vdivnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5e,0x52,0x80] + vdivnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vdivnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + vdivnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdivnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vdivnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5e,0x10] + vdivnepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vdivnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vdivnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vdivnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5e,0x51,0x7f] + vdivnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vdivnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5e,0x52,0x80] + vdivnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vdivnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + vdivnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdivnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vdivnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5e,0x10] + vdivnepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vdivnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vdivnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vdivnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5e,0x51,0x7f] + vdivnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vdivnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5e,0x52,0x80] + vdivnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmadd132nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0xd4] + vfmadd132nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: 
encoding: [0x62,0xf6,0x64,0x2f,0x98,0xd4] + vfmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x98,0xd4] + vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmadd132nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0xd4] + vfmadd132nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x98,0xd4] + vfmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x98,0xd4] + vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfmadd132nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0xd4] + vfmadd132nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmadd132nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x98,0xd4] + vfmadd132nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x98,0xd4] + vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x98,0x10] + vfmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x98,0x51,0x7f] + vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x98,0x52,0x80] + vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x98,0x10] + vfmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x98,0x51,0x7f] + vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x98,0x52,0x80] + vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr 
[edx - 256]{1to16} + +// CHECK: vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x98,0x10] + vfmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x98,0x51,0x7f] + vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x98,0x52,0x80] + vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmadd213nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0xd4] + vfmadd213nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xa8,0xd4] + vfmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xa8,0xd4] + vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmadd213nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0xd4] + vfmadd213nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xa8,0xd4] + vfmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xa8,0xd4] + vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfmadd213nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0xd4] + vfmadd213nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xa8,0xd4] + vfmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xa8,0xd4] + vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xa8,0x10] + vfmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xa8,0x51,0x7f] + vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, 
zmmword ptr [ecx + 8128] + +// CHECK: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xa8,0x52,0x80] + vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xa8,0x10] + vfmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xa8,0x51,0x7f] + vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xa8,0x52,0x80] + vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xa8,0x10] + vfmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xa8,0x51,0x7f] + vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xa8,0x52,0x80] + vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmadd231nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0xd4] + vfmadd231nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xb8,0xd4] + vfmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xb8,0xd4] + vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmadd231nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0xd4] + vfmadd231nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xb8,0xd4] + vfmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xb8,0xd4] + vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, 
zmm4 + +// CHECK: vfmadd231nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0xd4] + vfmadd231nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xb8,0xd4] + vfmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xb8,0xd4] + vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xb8,0x10] + vfmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xb8,0x51,0x7f] + vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xb8,0x52,0x80] + vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xb8,0x10] + vfmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xb8,0x51,0x7f] + vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xb8,0x52,0x80] + vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xb8,0x10] + vfmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmadd231nepbf16 
xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xb8,0x51,0x7f] + vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xb8,0x52,0x80] + vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmsub132nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0xd4] + vfmsub132nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9a,0xd4] + vfmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9a,0xd4] + vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmsub132nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0xd4] + vfmsub132nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9a,0xd4] + vfmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9a,0xd4] + vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfmsub132nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0xd4] + vfmsub132nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9a,0xd4] + vfmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9a,0xd4] + vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9a,0x10] + vfmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9a,0x51,0x7f] + vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9a,0x52,0x80] + vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 
4*eax + 291] + +// CHECK: vfmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9a,0x10] + vfmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9a,0x51,0x7f] + vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9a,0x52,0x80] + vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9a,0x10] + vfmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9a,0x51,0x7f] + vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9a,0x52,0x80] + vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmsub213nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0xd4] + vfmsub213nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xaa,0xd4] + vfmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xaa,0xd4] + vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmsub213nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0xd4] + vfmsub213nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xaa,0xd4] + vfmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xaa,0xd4] + vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfmsub213nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0xd4] + vfmsub213nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xaa,0xd4] + vfmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xaa,0xd4] + vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + 
+// CHECK: vfmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xaa,0x10] + vfmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xaa,0x51,0x7f] + vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xaa,0x52,0x80] + vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xaa,0x10] + vfmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xaa,0x51,0x7f] + vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xaa,0x52,0x80] + vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xaa,0x10] + vfmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xaa,0x51,0x7f] + vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xaa,0x52,0x80] + vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmsub231nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: 
[0x62,0xf6,0x64,0x28,0xba,0xd4] + vfmsub231nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xba,0xd4] + vfmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xba,0xd4] + vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmsub231nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0xd4] + vfmsub231nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xba,0xd4] + vfmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xba,0xd4] + vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfmsub231nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0xd4] + vfmsub231nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xba,0xd4] + vfmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xba,0xd4] + vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xba,0x10] + vfmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xba,0x51,0x7f] + vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xba,0x52,0x80] + vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xba,0x10] + vfmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xba,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xba,0x51,0x7f] + vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, word 
ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xba,0x52,0x80] + vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xba,0x10] + vfmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xba,0x51,0x7f] + vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xba,0x52,0x80] + vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmadd132nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0xd4] + vfnmadd132nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9c,0xd4] + vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9c,0xd4] + vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmadd132nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0xd4] + vfnmadd132nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9c,0xd4] + vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9c,0xd4] + vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmadd132nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0xd4] + vfnmadd132nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9c,0xd4] + vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9c,0xd4] + vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9c,0x10] + vfnmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: 
vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9c,0x51,0x7f] + vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9c,0x52,0x80] + vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9c,0x10] + vfnmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9c,0x51,0x7f] + vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9c,0x52,0x80] + vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9c,0x10] + vfnmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9c,0x51,0x7f] + vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9c,0x52,0x80] + vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmadd213nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0xd4] + vfnmadd213nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xac,0xd4] + vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xac,0xd4] + vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmadd213nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0xd4] + vfnmadd213nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: 
[0x62,0xf6,0x64,0x4f,0xac,0xd4] + vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xac,0xd4] + vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmadd213nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0xd4] + vfnmadd213nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xac,0xd4] + vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xac,0xd4] + vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xac,0x10] + vfnmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xac,0x51,0x7f] + vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xac,0x52,0x80] + vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xac,0x10] + vfnmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xac,0x51,0x7f] + vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xac,0x52,0x80] + vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + 
vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xac,0x10] + vfnmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xac,0x51,0x7f] + vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xac,0x52,0x80] + vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmadd231nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbc,0xd4] + vfnmadd231nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbc,0xd4] + vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbc,0xd4] + vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmadd231nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0xd4] + vfnmadd231nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbc,0xd4] + vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbc,0xd4] + vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmadd231nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0xd4] + vfnmadd231nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbc,0xd4] + vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbc,0xd4] + vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xbc,0x10] + vfnmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbc,0x51,0x7f] + vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xbc,0x52,0x80] + vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0x62,0xf6,0x64,0x28,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xbc,0x10] + vfnmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbc,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbc,0x51,0x7f] + vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xbc,0x52,0x80] + vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xbc,0x10] + vfnmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbc,0x51,0x7f] + vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xbc,0x52,0x80] + vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmsub132nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0xd4] + vfnmsub132nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9e,0xd4] + vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9e,0xd4] + vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmsub132nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0xd4] + vfnmsub132nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9e,0xd4] + vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9e,0xd4] + vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmsub132nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0xd4] + vfnmsub132nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9e,0xd4] + vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 + +// 
CHECK: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9e,0xd4] + vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9e,0x10] + vfnmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9e,0x51,0x7f] + vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9e,0x52,0x80] + vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9e,0x10] + vfnmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9e,0x51,0x7f] + vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9e,0x52,0x80] + vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9e,0x10] + vfnmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: 
encoding: [0x62,0xf6,0x64,0x8f,0x9e,0x51,0x7f] + vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9e,0x52,0x80] + vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmsub213nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0xd4] + vfnmsub213nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xae,0xd4] + vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xae,0xd4] + vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmsub213nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0xd4] + vfnmsub213nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xae,0xd4] + vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xae,0xd4] + vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmsub213nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0xd4] + vfnmsub213nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xae,0xd4] + vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xae,0xd4] + vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xae,0x10] + vfnmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xae,0x51,0x7f] + vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xae,0x52,0x80] + vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xae,0x10] + vfnmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmsub213nepbf16 ymm2, 
ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xae,0x51,0x7f] + vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xae,0x52,0x80] + vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xae,0x10] + vfnmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xae,0x51,0x7f] + vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xae,0x52,0x80] + vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmsub231nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0xd4] + vfnmsub231nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbe,0xd4] + vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbe,0xd4] + vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmsub231nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbe,0xd4] + vfnmsub231nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbe,0xd4] + vfnmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbe,0xd4] + vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmsub231nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0xd4] + vfnmsub231nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbe,0xd4] + vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbe,0xd4] + vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 zmm2 
{k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xbe,0x10] + vfnmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbe,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbe,0x51,0x7f] + vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xbe,0x52,0x80] + vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xbe,0x10] + vfnmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbe,0x51,0x7f] + vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xbe,0x52,0x80] + vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xbe,0x10] + vfnmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbe,0x51,0x7f] + vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xbe,0x52,0x80] + vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfpclasspbf16 k5, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0xeb,0x7b] + vfpclasspbf16 k5, zmm3, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0xeb,0x7b] + 
vfpclasspbf16 k5 {k7}, zmm3, 123 + +// CHECK: vfpclasspbf16 k5, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0xeb,0x7b] + vfpclasspbf16 k5, ymm3, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0xeb,0x7b] + vfpclasspbf16 k5 {k7}, ymm3, 123 + +// CHECK: vfpclasspbf16 k5, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0xeb,0x7b] + vfpclasspbf16 k5, xmm3, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0xeb,0x7b] + vfpclasspbf16 k5 {k7}, xmm3, 123 + +// CHECK: vfpclasspbf16 k5, xmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vfpclasspbf16 k5, xmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vfpclasspbf16 k5 {k7}, xmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vfpclasspbf16 k5, word ptr [eax]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x66,0x28,0x7b] + vfpclasspbf16 k5, word ptr [eax]{1to8}, 123 + +// CHECK: vfpclasspbf16 k5, xmmword ptr [2*ebp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vfpclasspbf16 k5, xmmword ptr [2*ebp - 512], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmmword ptr [ecx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, xmmword ptr [ecx + 2032], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to8}, 123 + +// CHECK: vfpclasspbf16 k5, word ptr [eax]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x66,0x28,0x7b] + vfpclasspbf16 k5, word ptr [eax]{1to16}, 123 + +// CHECK: vfpclasspbf16 k5, ymmword ptr [2*ebp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vfpclasspbf16 k5, ymmword ptr [2*ebp - 1024], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, ymmword ptr [ecx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, ymmword ptr [ecx + 4064], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to16}, 123 + +// CHECK: vfpclasspbf16 k5, word ptr [eax]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x66,0x28,0x7b] + vfpclasspbf16 k5, word ptr [eax]{1to32}, 123 + +// CHECK: vfpclasspbf16 k5, zmmword ptr [2*ebp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vfpclasspbf16 k5, zmmword ptr [2*ebp - 2048], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, zmmword ptr [ecx + 8128], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vgetexppbf16 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0xd3] + vgetexppbf16 xmm2, xmm3 + +// CHECK: vgetexppbf16 xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0xd3] + vgetexppbf16 xmm2 {k7}, xmm3 + +// CHECK: vgetexppbf16 xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0xd3] + 
vgetexppbf16 xmm2 {k7} {z}, xmm3 + +// CHECK: vgetexppbf16 zmm2, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0xd3] + vgetexppbf16 zmm2, zmm3 + +// CHECK: vgetexppbf16 zmm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x42,0xd3] + vgetexppbf16 zmm2 {k7}, zmm3 + +// CHECK: vgetexppbf16 zmm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0xd3] + vgetexppbf16 zmm2 {k7} {z}, zmm3 + +// CHECK: vgetexppbf16 ymm2, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0xd3] + vgetexppbf16 ymm2, ymm3 + +// CHECK: vgetexppbf16 ymm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0xd3] + vgetexppbf16 ymm2 {k7}, ymm3 + +// CHECK: vgetexppbf16 ymm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0xd3] + vgetexppbf16 ymm2 {k7} {z}, ymm3 + +// CHECK: vgetexppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vgetexppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vgetexppbf16 xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x42,0x10] + vgetexppbf16 xmm2, word ptr [eax]{1to8} + +// CHECK: vgetexppbf16 xmm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff] + vgetexppbf16 xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vgetexppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0x51,0x7f] + vgetexppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vgetexppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x42,0x52,0x80] + vgetexppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vgetexppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vgetexppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vgetexppbf16 ymm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x42,0x10] + vgetexppbf16 ymm2, word ptr [eax]{1to16} + +// CHECK: vgetexppbf16 ymm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff] + vgetexppbf16 ymm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vgetexppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0x51,0x7f] + vgetexppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vgetexppbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x42,0x52,0x80] + vgetexppbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vgetexppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vgetexppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vgetexppbf16 zmm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x42,0x10] + vgetexppbf16 zmm2, word ptr 
[eax]{1to32} + +// CHECK: vgetexppbf16 zmm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff] + vgetexppbf16 zmm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vgetexppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0x51,0x7f] + vgetexppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vgetexppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x42,0x52,0x80] + vgetexppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vgetmantpbf16 zmm2, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0xd3,0x7b] + vgetmantpbf16 zmm2, zmm3, 123 + +// CHECK: vgetmantpbf16 zmm2 {k7}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x26,0xd3,0x7b] + vgetmantpbf16 zmm2 {k7}, zmm3, 123 + +// CHECK: vgetmantpbf16 zmm2 {k7} {z}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x26,0xd3,0x7b] + vgetmantpbf16 zmm2 {k7} {z}, zmm3, 123 + +// CHECK: vgetmantpbf16 ymm2, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0xd3,0x7b] + vgetmantpbf16 ymm2, ymm3, 123 + +// CHECK: vgetmantpbf16 ymm2 {k7}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x26,0xd3,0x7b] + vgetmantpbf16 ymm2 {k7}, ymm3, 123 + +// CHECK: vgetmantpbf16 ymm2 {k7} {z}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x26,0xd3,0x7b] + vgetmantpbf16 ymm2 {k7} {z}, ymm3, 123 + +// CHECK: vgetmantpbf16 xmm2, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0xd3,0x7b] + vgetmantpbf16 xmm2, xmm3, 123 + +// CHECK: vgetmantpbf16 xmm2 {k7}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x26,0xd3,0x7b] + vgetmantpbf16 xmm2 {k7}, xmm3, 123 + +// CHECK: vgetmantpbf16 xmm2 {k7} {z}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x26,0xd3,0x7b] + vgetmantpbf16 xmm2 {k7} {z}, xmm3, 123 + +// CHECK: vgetmantpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vgetmantpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vgetmantpbf16 xmm2, word ptr [eax]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x26,0x10,0x7b] + vgetmantpbf16 xmm2, word ptr [eax]{1to8}, 123 + +// CHECK: vgetmantpbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vgetmantpbf16 xmm2, xmmword ptr [2*ebp - 512], 123 + +// CHECK: vgetmantpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 + +// CHECK: vgetmantpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x26,0x52,0x80,0x7b] + vgetmantpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 + +// CHECK: vgetmantpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vgetmantpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vgetmantpbf16 ymm2, word ptr [eax]{1to16}, 
123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x26,0x10,0x7b] + vgetmantpbf16 ymm2, word ptr [eax]{1to16}, 123 + +// CHECK: vgetmantpbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vgetmantpbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 + +// CHECK: vgetmantpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 + +// CHECK: vgetmantpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x26,0x52,0x80,0x7b] + vgetmantpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 + +// CHECK: vgetmantpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vgetmantpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vgetmantpbf16 zmm2, word ptr [eax]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x26,0x10,0x7b] + vgetmantpbf16 zmm2, word ptr [eax]{1to32}, 123 + +// CHECK: vgetmantpbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vgetmantpbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 + +// CHECK: vgetmantpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 + +// CHECK: vgetmantpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x26,0x52,0x80,0x7b] + vgetmantpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vmaxpbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0xd4] + vmaxpbf16 ymm2, ymm3, ymm4 + +// CHECK: vmaxpbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5f,0xd4] + vmaxpbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vmaxpbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5f,0xd4] + vmaxpbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vmaxpbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0xd4] + vmaxpbf16 zmm2, zmm3, zmm4 + +// CHECK: vmaxpbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5f,0xd4] + vmaxpbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vmaxpbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5f,0xd4] + vmaxpbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vmaxpbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0xd4] + vmaxpbf16 xmm2, xmm3, xmm4 + +// CHECK: vmaxpbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5f,0xd4] + vmaxpbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vmaxpbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5f,0xd4] + vmaxpbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vmaxpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmaxpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vmaxpbf16 
zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5f,0x10] + vmaxpbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vmaxpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0x14,0x6d,0x00,0xf8,0xff,0xff] + vmaxpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vmaxpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5f,0x51,0x7f] + vmaxpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vmaxpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5f,0x52,0x80] + vmaxpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vmaxpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmaxpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vmaxpbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5f,0x10] + vmaxpbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vmaxpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0x14,0x6d,0x00,0xfc,0xff,0xff] + vmaxpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vmaxpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5f,0x51,0x7f] + vmaxpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vmaxpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5f,0x52,0x80] + vmaxpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vmaxpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmaxpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vmaxpbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5f,0x10] + vmaxpbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vmaxpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0x14,0x6d,0x00,0xfe,0xff,0xff] + vmaxpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vmaxpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5f,0x51,0x7f] + vmaxpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vmaxpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5f,0x52,0x80] + vmaxpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vminpbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0xd4] + vminpbf16 ymm2, ymm3, ymm4 + +// CHECK: vminpbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5d,0xd4] + vminpbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vminpbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5d,0xd4] + vminpbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vminpbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0xd4] + vminpbf16 zmm2, zmm3, zmm4 + +// CHECK: vminpbf16 zmm2 {k7}, zmm3, zmm4 +// 
CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5d,0xd4] + vminpbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vminpbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5d,0xd4] + vminpbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vminpbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0xd4] + vminpbf16 xmm2, xmm3, xmm4 + +// CHECK: vminpbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5d,0xd4] + vminpbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vminpbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5d,0xd4] + vminpbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vminpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vminpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vminpbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5d,0x10] + vminpbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vminpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0x14,0x6d,0x00,0xf8,0xff,0xff] + vminpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vminpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5d,0x51,0x7f] + vminpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vminpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5d,0x52,0x80] + vminpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vminpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vminpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vminpbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5d,0x10] + vminpbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vminpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0x14,0x6d,0x00,0xfc,0xff,0xff] + vminpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vminpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5d,0x51,0x7f] + vminpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vminpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5d,0x52,0x80] + vminpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vminpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vminpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vminpbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5d,0x10] + vminpbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vminpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// 
CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0x14,0x6d,0x00,0xfe,0xff,0xff] + vminpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vminpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5d,0x51,0x7f] + vminpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vminpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5d,0x52,0x80] + vminpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vmulnepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x59,0xd4] + vmulnepbf16 ymm2, ymm3, ymm4 + +// CHECK: vmulnepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x59,0xd4] + vmulnepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vmulnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x59,0xd4] + vmulnepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vmulnepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0xd4] + vmulnepbf16 zmm2, zmm3, zmm4 + +// CHECK: vmulnepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x59,0xd4] + vmulnepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vmulnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x59,0xd4] + vmulnepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vmulnepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x59,0xd4] + vmulnepbf16 xmm2, xmm3, xmm4 + +// CHECK: vmulnepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x59,0xd4] + vmulnepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vmulnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x59,0xd4] + vmulnepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vmulnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmulnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vmulnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x59,0x10] + vmulnepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vmulnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0x14,0x6d,0x00,0xf8,0xff,0xff] + vmulnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vmulnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x59,0x51,0x7f] + vmulnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vmulnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x59,0x52,0x80] + vmulnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vmulnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmulnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vmulnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x59,0x10] + vmulnepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vmulnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: 
encoding: [0x62,0xf5,0x65,0x28,0x59,0x14,0x6d,0x00,0xfc,0xff,0xff] + vmulnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vmulnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x59,0x51,0x7f] + vmulnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vmulnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x59,0x52,0x80] + vmulnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vmulnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmulnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vmulnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x59,0x10] + vmulnepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vmulnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x59,0x14,0x6d,0x00,0xfe,0xff,0xff] + vmulnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vmulnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x59,0x51,0x7f] + vmulnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vmulnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x59,0x52,0x80] + vmulnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vrcppbf16 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0xd3] + vrcppbf16 xmm2, xmm3 + +// CHECK: vrcppbf16 xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4c,0xd3] + vrcppbf16 xmm2 {k7}, xmm3 + +// CHECK: vrcppbf16 xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4c,0xd3] + vrcppbf16 xmm2 {k7} {z}, xmm3 + +// CHECK: vrcppbf16 zmm2, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4c,0xd3] + vrcppbf16 zmm2, zmm3 + +// CHECK: vrcppbf16 zmm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4c,0xd3] + vrcppbf16 zmm2 {k7}, zmm3 + +// CHECK: vrcppbf16 zmm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4c,0xd3] + vrcppbf16 zmm2 {k7} {z}, zmm3 + +// CHECK: vrcppbf16 ymm2, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0xd3] + vrcppbf16 ymm2, ymm3 + +// CHECK: vrcppbf16 ymm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4c,0xd3] + vrcppbf16 ymm2 {k7}, ymm3 + +// CHECK: vrcppbf16 ymm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4c,0xd3] + vrcppbf16 ymm2 {k7} {z}, ymm3 + +// CHECK: vrcppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrcppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vrcppbf16 xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x7c,0x18,0x4c,0x10] + vrcppbf16 xmm2, word ptr [eax]{1to8} + +// CHECK: vrcppbf16 xmm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vrcppbf16 xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vrcppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: 
[0x62,0xf6,0x7c,0x8f,0x4c,0x51,0x7f] + vrcppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vrcppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x7c,0x9f,0x4c,0x52,0x80] + vrcppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vrcppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrcppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vrcppbf16 ymm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x7c,0x38,0x4c,0x10] + vrcppbf16 ymm2, word ptr [eax]{1to16} + +// CHECK: vrcppbf16 ymm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vrcppbf16 ymm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vrcppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4c,0x51,0x7f] + vrcppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vrcppbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x7c,0xbf,0x4c,0x52,0x80] + vrcppbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vrcppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrcppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vrcppbf16 zmm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7c,0x58,0x4c,0x10] + vrcppbf16 zmm2, word ptr [eax]{1to32} + +// CHECK: vrcppbf16 zmm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vrcppbf16 zmm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vrcppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4c,0x51,0x7f] + vrcppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vrcppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7c,0xdf,0x4c,0x52,0x80] + vrcppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vreducenepbf16 zmm2, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0xd3,0x7b] + vreducenepbf16 zmm2, zmm3, 123 + +// CHECK: vreducenepbf16 zmm2 {k7}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x56,0xd3,0x7b] + vreducenepbf16 zmm2 {k7}, zmm3, 123 + +// CHECK: vreducenepbf16 zmm2 {k7} {z}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x56,0xd3,0x7b] + vreducenepbf16 zmm2 {k7} {z}, zmm3, 123 + +// CHECK: vreducenepbf16 ymm2, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x56,0xd3,0x7b] + vreducenepbf16 ymm2, ymm3, 123 + +// CHECK: vreducenepbf16 ymm2 {k7}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x56,0xd3,0x7b] + vreducenepbf16 ymm2 {k7}, ymm3, 123 + +// CHECK: vreducenepbf16 ymm2 {k7} {z}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x56,0xd3,0x7b] + vreducenepbf16 ymm2 {k7} {z}, ymm3, 123 + +// CHECK: vreducenepbf16 xmm2, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0xd3,0x7b] + vreducenepbf16 xmm2, xmm3, 123 + +// CHECK: vreducenepbf16 xmm2 {k7}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x56,0xd3,0x7b] + vreducenepbf16 xmm2 {k7}, 
xmm3, 123 + +// CHECK: vreducenepbf16 xmm2 {k7} {z}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x56,0xd3,0x7b] + vreducenepbf16 xmm2 {k7} {z}, xmm3, 123 + +// CHECK: vreducenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vreducenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vreducenepbf16 xmm2, word ptr [eax]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x56,0x10,0x7b] + vreducenepbf16 xmm2, word ptr [eax]{1to8}, 123 + +// CHECK: vreducenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vreducenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 + +// CHECK: vreducenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x56,0x51,0x7f,0x7b] + vreducenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 + +// CHECK: vreducenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x56,0x52,0x80,0x7b] + vreducenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 + +// CHECK: vreducenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vreducenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vreducenepbf16 ymm2, word ptr [eax]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x56,0x10,0x7b] + vreducenepbf16 ymm2, word ptr [eax]{1to16}, 123 + +// CHECK: vreducenepbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x56,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vreducenepbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 + +// CHECK: vreducenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x56,0x51,0x7f,0x7b] + vreducenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 + +// CHECK: vreducenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x56,0x52,0x80,0x7b] + vreducenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 + +// CHECK: vreducenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vreducenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vreducenepbf16 zmm2, word ptr [eax]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x56,0x10,0x7b] + vreducenepbf16 zmm2, word ptr [eax]{1to32}, 123 + +// CHECK: vreducenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vreducenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 + +// CHECK: vreducenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: 
[0x62,0xf3,0x7f,0xcf,0x56,0x51,0x7f,0x7b] + vreducenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 + +// CHECK: vreducenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x56,0x52,0x80,0x7b] + vreducenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vrndscalenepbf16 zmm2, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0xd3,0x7b] + vrndscalenepbf16 zmm2, zmm3, 123 + +// CHECK: vrndscalenepbf16 zmm2 {k7}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x08,0xd3,0x7b] + vrndscalenepbf16 zmm2 {k7}, zmm3, 123 + +// CHECK: vrndscalenepbf16 zmm2 {k7} {z}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x08,0xd3,0x7b] + vrndscalenepbf16 zmm2 {k7} {z}, zmm3, 123 + +// CHECK: vrndscalenepbf16 ymm2, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0xd3,0x7b] + vrndscalenepbf16 ymm2, ymm3, 123 + +// CHECK: vrndscalenepbf16 ymm2 {k7}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x08,0xd3,0x7b] + vrndscalenepbf16 ymm2 {k7}, ymm3, 123 + +// CHECK: vrndscalenepbf16 ymm2 {k7} {z}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x08,0xd3,0x7b] + vrndscalenepbf16 ymm2 {k7} {z}, ymm3, 123 + +// CHECK: vrndscalenepbf16 xmm2, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0xd3,0x7b] + vrndscalenepbf16 xmm2, xmm3, 123 + +// CHECK: vrndscalenepbf16 xmm2 {k7}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x08,0xd3,0x7b] + vrndscalenepbf16 xmm2 {k7}, xmm3, 123 + +// CHECK: vrndscalenepbf16 xmm2 {k7} {z}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x08,0xd3,0x7b] + vrndscalenepbf16 xmm2 {k7} {z}, xmm3, 123 + +// CHECK: vrndscalenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vrndscalenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vrndscalenepbf16 xmm2, word ptr [eax]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x08,0x10,0x7b] + vrndscalenepbf16 xmm2, word ptr [eax]{1to8}, 123 + +// CHECK: vrndscalenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vrndscalenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 + +// CHECK: vrndscalenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 + +// CHECK: vrndscalenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 + +// CHECK: vrndscalenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vrndscalenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vrndscalenepbf16 ymm2, word ptr [eax]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x08,0x10,0x7b] + vrndscalenepbf16 ymm2, word ptr [eax]{1to16}, 123 + +// CHECK: vrndscalenepbf16 ymm2, 
ymmword ptr [2*ebp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vrndscalenepbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 + +// CHECK: vrndscalenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 + +// CHECK: vrndscalenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 + +// CHECK: vrndscalenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vrndscalenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vrndscalenepbf16 zmm2, word ptr [eax]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x08,0x10,0x7b] + vrndscalenepbf16 zmm2, word ptr [eax]{1to32}, 123 + +// CHECK: vrndscalenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vrndscalenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 + +// CHECK: vrndscalenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 + +// CHECK: vrndscalenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vrsqrtpbf16 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0xd3] + vrsqrtpbf16 xmm2, xmm3 + +// CHECK: vrsqrtpbf16 xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4e,0xd3] + vrsqrtpbf16 xmm2 {k7}, xmm3 + +// CHECK: vrsqrtpbf16 xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4e,0xd3] + vrsqrtpbf16 xmm2 {k7} {z}, xmm3 + +// CHECK: vrsqrtpbf16 zmm2, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0xd3] + vrsqrtpbf16 zmm2, zmm3 + +// CHECK: vrsqrtpbf16 zmm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4e,0xd3] + vrsqrtpbf16 zmm2 {k7}, zmm3 + +// CHECK: vrsqrtpbf16 zmm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4e,0xd3] + vrsqrtpbf16 zmm2 {k7} {z}, zmm3 + +// CHECK: vrsqrtpbf16 ymm2, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0xd3] + vrsqrtpbf16 ymm2, ymm3 + +// CHECK: vrsqrtpbf16 ymm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4e,0xd3] + vrsqrtpbf16 ymm2 {k7}, ymm3 + +// CHECK: vrsqrtpbf16 ymm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4e,0xd3] + vrsqrtpbf16 ymm2 {k7} {z}, ymm3 + +// CHECK: vrsqrtpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrsqrtpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vrsqrtpbf16 xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x7c,0x18,0x4e,0x10] + vrsqrtpbf16 xmm2, word ptr [eax]{1to8} + +// CHECK: vrsqrtpbf16 xmm2, xmmword ptr [2*ebp - 512] 
+// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vrsqrtpbf16 xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vrsqrtpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4e,0x51,0x7f] + vrsqrtpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vrsqrtpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x7c,0x9f,0x4e,0x52,0x80] + vrsqrtpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vrsqrtpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrsqrtpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vrsqrtpbf16 ymm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x7c,0x38,0x4e,0x10] + vrsqrtpbf16 ymm2, word ptr [eax]{1to16} + +// CHECK: vrsqrtpbf16 ymm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vrsqrtpbf16 ymm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vrsqrtpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4e,0x51,0x7f] + vrsqrtpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vrsqrtpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x7c,0xbf,0x4e,0x52,0x80] + vrsqrtpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vrsqrtpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrsqrtpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vrsqrtpbf16 zmm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7c,0x58,0x4e,0x10] + vrsqrtpbf16 zmm2, word ptr [eax]{1to32} + +// CHECK: vrsqrtpbf16 zmm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vrsqrtpbf16 zmm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vrsqrtpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4e,0x51,0x7f] + vrsqrtpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vrsqrtpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7c,0xdf,0x4e,0x52,0x80] + vrsqrtpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vscalefpbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0xd4] + vscalefpbf16 ymm2, ymm3, ymm4 + +// CHECK: vscalefpbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x2c,0xd4] + vscalefpbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vscalefpbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x2c,0xd4] + vscalefpbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vscalefpbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0xd4] + vscalefpbf16 zmm2, zmm3, zmm4 + +// CHECK: vscalefpbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x2c,0xd4] + vscalefpbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vscalefpbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x2c,0xd4] + vscalefpbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vscalefpbf16 xmm2, xmm3, xmm4 +// 
CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0xd4] + vscalefpbf16 xmm2, xmm3, xmm4 + +// CHECK: vscalefpbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x2c,0xd4] + vscalefpbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vscalefpbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x2c,0xd4] + vscalefpbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vscalefpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vscalefpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vscalefpbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x2c,0x10] + vscalefpbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vscalefpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vscalefpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vscalefpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x2c,0x51,0x7f] + vscalefpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vscalefpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x2c,0x52,0x80] + vscalefpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vscalefpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vscalefpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vscalefpbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x2c,0x10] + vscalefpbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vscalefpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vscalefpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vscalefpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x2c,0x51,0x7f] + vscalefpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vscalefpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x2c,0x52,0x80] + vscalefpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vscalefpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vscalefpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vscalefpbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x2c,0x10] + vscalefpbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vscalefpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vscalefpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vscalefpbf16 xmm2 
{k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x2c,0x51,0x7f] + vscalefpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vscalefpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x2c,0x52,0x80] + vscalefpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vsqrtnepbf16 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0xd3] + vsqrtnepbf16 xmm2, xmm3 + +// CHECK: vsqrtnepbf16 xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x51,0xd3] + vsqrtnepbf16 xmm2 {k7}, xmm3 + +// CHECK: vsqrtnepbf16 xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x51,0xd3] + vsqrtnepbf16 xmm2 {k7} {z}, xmm3 + +// CHECK: vsqrtnepbf16 zmm2, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0xd3] + vsqrtnepbf16 zmm2, zmm3 + +// CHECK: vsqrtnepbf16 zmm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x51,0xd3] + vsqrtnepbf16 zmm2 {k7}, zmm3 + +// CHECK: vsqrtnepbf16 zmm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x51,0xd3] + vsqrtnepbf16 zmm2 {k7} {z}, zmm3 + +// CHECK: vsqrtnepbf16 ymm2, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0xd3] + vsqrtnepbf16 ymm2, ymm3 + +// CHECK: vsqrtnepbf16 ymm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x51,0xd3] + vsqrtnepbf16 ymm2 {k7}, ymm3 + +// CHECK: vsqrtnepbf16 ymm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x51,0xd3] + vsqrtnepbf16 ymm2 {k7} {z}, ymm3 + +// CHECK: vsqrtnepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsqrtnepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vsqrtnepbf16 xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x51,0x10] + vsqrtnepbf16 xmm2, word ptr [eax]{1to8} + +// CHECK: vsqrtnepbf16 xmm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vsqrtnepbf16 xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vsqrtnepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x51,0x51,0x7f] + vsqrtnepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vsqrtnepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x51,0x52,0x80] + vsqrtnepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vsqrtnepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsqrtnepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vsqrtnepbf16 ymm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x51,0x10] + vsqrtnepbf16 ymm2, word ptr [eax]{1to16} + +// CHECK: vsqrtnepbf16 ymm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vsqrtnepbf16 ymm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vsqrtnepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x51,0x51,0x7f] + vsqrtnepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vsqrtnepbf16 ymm2 {k7} {z}, 
word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x51,0x52,0x80] + vsqrtnepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vsqrtnepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsqrtnepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vsqrtnepbf16 zmm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x51,0x10] + vsqrtnepbf16 zmm2, word ptr [eax]{1to32} + +// CHECK: vsqrtnepbf16 zmm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff] + vsqrtnepbf16 zmm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vsqrtnepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x51,0x51,0x7f] + vsqrtnepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vsqrtnepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x51,0x52,0x80] + vsqrtnepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vsubnepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0xd4] + vsubnepbf16 ymm2, ymm3, ymm4 + +// CHECK: vsubnepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5c,0xd4] + vsubnepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vsubnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5c,0xd4] + vsubnepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vsubnepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0xd4] + vsubnepbf16 zmm2, zmm3, zmm4 + +// CHECK: vsubnepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5c,0xd4] + vsubnepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vsubnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5c,0xd4] + vsubnepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vsubnepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0xd4] + vsubnepbf16 xmm2, xmm3, xmm4 + +// CHECK: vsubnepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5c,0xd4] + vsubnepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vsubnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5c,0xd4] + vsubnepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vsubnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10] + vsubnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsubnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00] + vsubnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vsubnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5c,0x10] + vsubnepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vsubnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vsubnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vsubnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5c,0x51,0x7f] + vsubnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vsubnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: 
[0x62,0xf5,0x65,0xdf,0x5c,0x52,0x80] + vsubnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vsubnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10] + vsubnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsubnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00] + vsubnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vsubnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5c,0x10] + vsubnepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vsubnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vsubnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vsubnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5c,0x51,0x7f] + vsubnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vsubnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5c,0x52,0x80] + vsubnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vsubnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10] + vsubnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsubnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00] + vsubnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vsubnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5c,0x10] + vsubnepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vsubnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vsubnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vsubnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5c,0x51,0x7f] + vsubnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vsubnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5c,0x52,0x80] + vsubnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + diff --git a/llvm/test/MC/X86/avx10.2-bf16-64-att.s b/llvm/test/MC/X86/avx10.2-bf16-64-att.s new file mode 100644 index 0000000000000..85d99cfe0a704 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-bf16-64-att.s @@ -0,0 +1,3014 @@ +// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s + +// CHECK: vaddnepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x58,0xf0] + vaddnepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x58,0xf0] + vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x58,0xf0] + vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vaddnepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x58,0xf0] + vaddnepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x58,0xf0] + vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: 
[0x62,0x85,0x45,0xc7,0x58,0xf0] + vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vaddnepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x58,0xf0] + vaddnepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x58,0xf0] + vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x58,0xf0] + vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vaddnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vaddnepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vaddnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x58,0x34,0x6d,0x00,0xf8,0xff,0xff] + vaddnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vaddnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x58,0x71,0x7f] + vaddnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x58,0x72,0x80] + vaddnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vaddnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vaddnepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vaddnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x58,0x34,0x6d,0x00,0xfc,0xff,0xff] + vaddnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vaddnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x58,0x71,0x7f] + vaddnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x58,0x72,0x80] + vaddnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vaddnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vaddnepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vaddnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x58,0x34,0x6d,0x00,0xfe,0xff,0xff] + vaddnepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vaddnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: 
[0x62,0xe5,0x45,0x87,0x58,0x71,0x7f] + vaddnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x58,0x72,0x80] + vaddnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcmppbf16 $123, %ymm24, %ymm23, %k5 +// CHECK: encoding: [0x62,0x93,0x47,0x20,0xc2,0xe8,0x7b] + vcmppbf16 $123, %ymm24, %ymm23, %k5 + +// CHECK: vcmppbf16 $123, %ymm24, %ymm23, %k5 {%k7} +// CHECK: encoding: [0x62,0x93,0x47,0x27,0xc2,0xe8,0x7b] + vcmppbf16 $123, %ymm24, %ymm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, %xmm24, %xmm23, %k5 +// CHECK: encoding: [0x62,0x93,0x47,0x00,0xc2,0xe8,0x7b] + vcmppbf16 $123, %xmm24, %xmm23, %k5 + +// CHECK: vcmppbf16 $123, %xmm24, %xmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0x93,0x47,0x07,0xc2,0xe8,0x7b] + vcmppbf16 $123, %xmm24, %xmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, %zmm24, %zmm23, %k5 +// CHECK: encoding: [0x62,0x93,0x47,0x40,0xc2,0xe8,0x7b] + vcmppbf16 $123, %zmm24, %zmm23, %k5 + +// CHECK: vcmppbf16 $123, %zmm24, %zmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0x93,0x47,0x47,0xc2,0xe8,0x7b] + vcmppbf16 $123, %zmm24, %zmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%rbp,%r14,8), %zmm23, %k5 +// CHECK: encoding: [0x62,0xb3,0x47,0x40,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%rbp,%r14,8), %zmm23, %k5 + +// CHECK: vcmppbf16 $123, 291(%r8,%rax,4), %zmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xd3,0x47,0x47,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%r8,%rax,4), %zmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%rip){1to32}, %zmm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x50,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 $123, (%rip){1to32}, %zmm23, %k5 + +// CHECK: vcmppbf16 $123, -2048(,%rbp,2), %zmm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x40,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vcmppbf16 $123, -2048(,%rbp,2), %zmm23, %k5 + +// CHECK: vcmppbf16 $123, 8128(%rcx), %zmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x47,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 8128(%rcx), %zmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%rdx){1to32}, %zmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x57,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%rdx){1to32}, %zmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%rbp,%r14,8), %xmm23, %k5 +// CHECK: encoding: [0x62,0xb3,0x47,0x00,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%rbp,%r14,8), %xmm23, %k5 + +// CHECK: vcmppbf16 $123, 291(%r8,%rax,4), %xmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xd3,0x47,0x07,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%r8,%rax,4), %xmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%rip){1to8}, %xmm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x10,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 $123, (%rip){1to8}, %xmm23, %k5 + +// CHECK: vcmppbf16 $123, -512(,%rbp,2), %xmm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x00,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vcmppbf16 $123, -512(,%rbp,2), %xmm23, %k5 + +// CHECK: vcmppbf16 $123, 2032(%rcx), %xmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x07,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 2032(%rcx), %xmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%rdx){1to8}, %xmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x17,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%rdx){1to8}, %xmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%rbp,%r14,8), %ymm23, %k5 +// CHECK: encoding: 
[0x62,0xb3,0x47,0x20,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%rbp,%r14,8), %ymm23, %k5 + +// CHECK: vcmppbf16 $123, 291(%r8,%rax,4), %ymm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xd3,0x47,0x27,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%r8,%rax,4), %ymm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%rip){1to16}, %ymm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x30,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 $123, (%rip){1to16}, %ymm23, %k5 + +// CHECK: vcmppbf16 $123, -1024(,%rbp,2), %ymm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x20,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vcmppbf16 $123, -1024(,%rbp,2), %ymm23, %k5 + +// CHECK: vcmppbf16 $123, 4064(%rcx), %ymm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x27,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 4064(%rcx), %ymm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%rdx){1to16}, %ymm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x37,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%rdx){1to16}, %ymm23, %k5 {%k7} + +// CHECK: vcomsbf16 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x2f,0xf7] + vcomsbf16 %xmm23, %xmm22 + +// CHECK: vcomsbf16 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomsbf16 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vcomsbf16 291(%r8,%rax,4), %xmm22 +// CHECK: encoding: [0x62,0xc5,0x7d,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00] + vcomsbf16 291(%r8,%rax,4), %xmm22 + +// CHECK: vcomsbf16 (%rip), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x35,0x00,0x00,0x00,0x00] + vcomsbf16 (%rip), %xmm22 + +// CHECK: vcomsbf16 -64(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff] + vcomsbf16 -64(,%rbp,2), %xmm22 + +// CHECK: vcomsbf16 254(%rcx), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x71,0x7f] + vcomsbf16 254(%rcx), %xmm22 + +// CHECK: vcomsbf16 -256(%rdx), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x72,0x80] + vcomsbf16 -256(%rdx), %xmm22 + +// CHECK: vdivnepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5e,0xf0] + vdivnepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5e,0xf0] + vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5e,0xf0] + vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vdivnepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5e,0xf0] + vdivnepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5e,0xf0] + vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5e,0xf0] + vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vdivnepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5e,0xf0] + vdivnepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5e,0xf0] + vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5e,0xf0] + vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 
268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vdivnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vdivnepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vdivnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vdivnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vdivnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5e,0x71,0x7f] + vdivnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5e,0x72,0x80] + vdivnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vdivnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vdivnepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vdivnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vdivnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vdivnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5e,0x71,0x7f] + vdivnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5e,0x72,0x80] + vdivnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vdivnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vdivnepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vdivnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vdivnepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vdivnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5e,0x71,0x7f] + vdivnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5e,0x72,0x80] + vdivnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x98,0xf0] + vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x98,0xf0] + vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x98,0xf0] + 
vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x98,0xf0] + vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x98,0xf0] + vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x98,0xf0] + vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x98,0xf0] + vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x98,0xf0] + vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x98,0xf0] + vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x98,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x98,0x71,0x7f] + vfmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x98,0x72,0x80] + vfmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x98,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x98,0x71,0x7f] + vfmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x98,0x72,0x80] + vfmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmadd132nepbf16 
291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x98,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x98,0x71,0x7f] + vfmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x98,0x72,0x80] + vfmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xa8,0xf0] + vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xa8,0xf0] + vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xa8,0xf0] + vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xa8,0xf0] + vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xa8,0xf0] + vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xa8,0xf0] + vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xa8,0xf0] + vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xa8,0xf0] + vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xa8,0xf0] + vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xa8,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xa8,0x71,0x7f] + vfmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xa8,0x72,0x80] + vfmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%rbp,%r14,8), 
%ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xa8,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xa8,0x71,0x7f] + vfmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xa8,0x72,0x80] + vfmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xa8,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xa8,0x71,0x7f] + vfmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xa8,0x72,0x80] + vfmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xb8,0xf0] + vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xb8,0xf0] + vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xb8,0xf0] + vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xb8,0xf0] + vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xb8,0xf0] + vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xb8,0xf0] + vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xb8,0xf0] + vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xb8,0xf0] + vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd231nepbf16 %xmm24, 
%xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xb8,0xf0] + vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xb8,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xb8,0x71,0x7f] + vfmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xb8,0x72,0x80] + vfmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xb8,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xb8,0x71,0x7f] + vfmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xb8,0x72,0x80] + vfmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xb8,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xb8,0x71,0x7f] + vfmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xb8,0x72,0x80] + vfmadd231nepbf16 
-256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9a,0xf0] + vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9a,0xf0] + vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9a,0xf0] + vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9a,0xf0] + vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9a,0xf0] + vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9a,0xf0] + vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9a,0xf0] + vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9a,0xf0] + vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9a,0xf0] + vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9a,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmsub132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x9a,0x71,0x7f] + vfmsub132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9a,0x72,0x80] + vfmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9a,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmsub132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9a,0x71,0x7f] + vfmsub132nepbf16 
4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9a,0x72,0x80] + vfmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9a,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9a,0x71,0x7f] + vfmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9a,0x72,0x80] + vfmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xaa,0xf0] + vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xaa,0xf0] + vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xaa,0xf0] + vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xaa,0xf0] + vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xaa,0xf0] + vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xaa,0xf0] + vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xaa,0xf0] + vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xaa,0xf0] + vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xaa,0xf0] + vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xaa,0x34,0x6d,0x00,0xf8,0xff,0xff] + 
vfmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xaa,0x71,0x7f] + vfmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xaa,0x72,0x80] + vfmsub213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xaa,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xaa,0x71,0x7f] + vfmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xaa,0x72,0x80] + vfmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xaa,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xaa,0x71,0x7f] + vfmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xaa,0x72,0x80] + vfmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xba,0xf0] + vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xba,0xf0] + vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xba,0xf0] + vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xba,0xf0] + vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xba,0xf0] + vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub231nepbf16 %zmm24, 
%zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xba,0xf0] + vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xba,0xf0] + vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xba,0xf0] + vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xba,0xf0] + vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xba,0x35,0x00,0x00,0x00,0x00] + vfmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmsub231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xba,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmsub231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xba,0x71,0x7f] + vfmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xba,0x72,0x80] + vfmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xba,0x35,0x00,0x00,0x00,0x00] + vfmsub231nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xba,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xba,0x71,0x7f] + vfmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xba,0x72,0x80] + vfmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xba,0x35,0x00,0x00,0x00,0x00] + vfmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmsub231nepbf16 -512(,%rbp,2), 
%xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xba,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xba,0x71,0x7f] + vfmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xba,0x72,0x80] + vfmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9c,0xf0] + vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9c,0xf0] + vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9c,0xf0] + vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9c,0xf0] + vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9c,0xf0] + vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9c,0xf0] + vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9c,0xf0] + vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9c,0xf0] + vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9c,0xf0] + vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfnmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x9c,0x71,0x7f] + vfnmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9c,0x72,0x80] + vfnmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: 
vfnmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9c,0x71,0x7f] + vfnmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9c,0x72,0x80] + vfnmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmadd132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfnmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9c,0x71,0x7f] + vfnmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9c,0x72,0x80] + vfnmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xac,0xf0] + vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xac,0xf0] + vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xac,0xf0] + vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xac,0xf0] + vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xac,0xf0] + vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xac,0xf0] + vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xac,0xf0] + vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xac,0xf0] + vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xac,0xf0] + vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, 
%zmm22 + +// CHECK: vfnmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xac,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xac,0x71,0x7f] + vfnmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xac,0x72,0x80] + vfnmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfnmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xac,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xac,0x71,0x7f] + vfnmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xac,0x72,0x80] + vfnmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xac,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfnmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xac,0x71,0x7f] + vfnmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xac,0x72,0x80] + vfnmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xbc,0xf0] + vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: 
[0x62,0x86,0x44,0x27,0xbc,0xf0] + vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xbc,0xf0] + vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xbc,0xf0] + vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xbc,0xf0] + vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xbc,0xf0] + vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xbc,0xf0] + vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xbc,0xf0] + vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xbc,0xf0] + vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfnmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xbc,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xbc,0x71,0x7f] + vfnmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xbc,0x72,0x80] + vfnmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfnmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xbc,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xbc,0x71,0x7f] + vfnmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xbc,0x72,0x80] + vfnmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// 
CHECK: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xbc,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfnmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xbc,0x71,0x7f] + vfnmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xbc,0x72,0x80] + vfnmadd231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9e,0xf0] + vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9e,0xf0] + vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9e,0xf0] + vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9e,0xf0] + vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9e,0xf0] + vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9e,0xf0] + vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9e,0xf0] + vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9e,0xf0] + vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9e,0xf0] + vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfnmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmsub132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x9e,0x71,0x7f] + vfnmsub132nepbf16 8128(%rcx), %zmm23, 
%zmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9e,0x72,0x80] + vfnmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmsub132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9e,0x71,0x7f] + vfnmsub132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9e,0x72,0x80] + vfnmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfnmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9e,0x71,0x7f] + vfnmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9e,0x72,0x80] + vfnmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xae,0xf0] + vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xae,0xf0] + vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xae,0xf0] + vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xae,0xf0] + vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xae,0xf0] + vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xae,0xf0] + vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %xmm24, %xmm23, 
%xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xae,0xf0] + vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xae,0xf0] + vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xae,0xf0] + vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfnmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xae,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xae,0x71,0x7f] + vfnmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xae,0x72,0x80] + vfnmsub213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xae,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xae,0x71,0x7f] + vfnmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xae,0x72,0x80] + vfnmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xae,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + 
+// CHECK: vfnmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xae,0x71,0x7f] + vfnmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xae,0x72,0x80] + vfnmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xbe,0xf0] + vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xbe,0xf0] + vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xbe,0xf0] + vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xbe,0xf0] + vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xbe,0xf0] + vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xbe,0xf0] + vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xbe,0xf0] + vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xbe,0xf0] + vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xbe,0xf0] + vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfnmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmsub231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xbe,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xbe,0x71,0x7f] + vfnmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xbe,0x72,0x80] + vfnmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 
(%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xbe,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xbe,0x71,0x7f] + vfnmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xbe,0x72,0x80] + vfnmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmsub231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xbe,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfnmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xbe,0x71,0x7f] + vfnmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xbe,0x72,0x80] + vfnmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfpclasspbf16 $123, %zmm23, %k5 +// CHECK: encoding: [0x62,0xb3,0x7f,0x48,0x66,0xef,0x7b] + vfpclasspbf16 $123, %zmm23, %k5 + +// CHECK: vfpclasspbf16 $123, %zmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xb3,0x7f,0x4f,0x66,0xef,0x7b] + vfpclasspbf16 $123, %zmm23, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, %ymm23, %k5 +// CHECK: encoding: [0x62,0xb3,0x7f,0x28,0x66,0xef,0x7b] + vfpclasspbf16 $123, %ymm23, %k5 + +// CHECK: vfpclasspbf16 $123, %ymm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xb3,0x7f,0x2f,0x66,0xef,0x7b] + vfpclasspbf16 $123, %ymm23, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, %xmm23, %k5 +// CHECK: encoding: [0x62,0xb3,0x7f,0x08,0x66,0xef,0x7b] + vfpclasspbf16 $123, %xmm23, %k5 + +// CHECK: vfpclasspbf16 $123, %xmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xb3,0x7f,0x0f,0x66,0xef,0x7b] + vfpclasspbf16 $123, %xmm23, %k5 {%k7} + +// CHECK: vfpclasspbf16x $123, 268435456(%rbp,%r14,8), %k5 +// CHECK: encoding: [0x62,0xb3,0x7f,0x08,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vfpclasspbf16x $123, 268435456(%rbp,%r14,8), %k5 + +// CHECK: vfpclasspbf16x $123, 291(%r8,%rax,4), %k5 {%k7} +// CHECK: encoding: [0x62,0xd3,0x7f,0x0f,0x66,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vfpclasspbf16x $123, 291(%r8,%rax,4), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%rip){1to8}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 $123, (%rip){1to8}, %k5 + +// CHECK: vfpclasspbf16x $123, -512(,%rbp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vfpclasspbf16x $123, -512(,%rbp,2), %k5 + +// CHECK: vfpclasspbf16x $123, 2032(%rcx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16x $123, 2032(%rcx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, 
-256(%rdx){1to8}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%rdx){1to8}, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%rip){1to16}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 $123, (%rip){1to16}, %k5 + +// CHECK: vfpclasspbf16y $123, -1024(,%rbp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vfpclasspbf16y $123, -1024(,%rbp,2), %k5 + +// CHECK: vfpclasspbf16y $123, 4064(%rcx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16y $123, 4064(%rcx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, -256(%rdx){1to16}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%rdx){1to16}, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%rip){1to32}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 $123, (%rip){1to32}, %k5 + +// CHECK: vfpclasspbf16z $123, -2048(,%rbp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vfpclasspbf16z $123, -2048(,%rbp,2), %k5 + +// CHECK: vfpclasspbf16z $123, 8128(%rcx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16z $123, 8128(%rcx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, -256(%rdx){1to32}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%rdx){1to32}, %k5 {%k7} + +// CHECK: vgetexppbf16 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xf7] + vgetexppbf16 %xmm23, %xmm22 + +// CHECK: vgetexppbf16 %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x0f,0x42,0xf7] + vgetexppbf16 %xmm23, %xmm22 {%k7} + +// CHECK: vgetexppbf16 %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0x8f,0x42,0xf7] + vgetexppbf16 %xmm23, %xmm22 {%k7} {z} + +// CHECK: vgetexppbf16 %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xf7] + vgetexppbf16 %zmm23, %zmm22 + +// CHECK: vgetexppbf16 %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x4f,0x42,0xf7] + vgetexppbf16 %zmm23, %zmm22 {%k7} + +// CHECK: vgetexppbf16 %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0xcf,0x42,0xf7] + vgetexppbf16 %zmm23, %zmm22 {%k7} {z} + +// CHECK: vgetexppbf16 %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xf7] + vgetexppbf16 %ymm23, %ymm22 + +// CHECK: vgetexppbf16 %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x2f,0x42,0xf7] + vgetexppbf16 %ymm23, %ymm22 {%k7} + +// CHECK: vgetexppbf16 %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0xaf,0x42,0xf7] + vgetexppbf16 %ymm23, %ymm22 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vgetexppbf16 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vgetexppbf16 (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x18,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 (%rip){1to8}, %xmm22 + +// CHECK: vgetexppbf16 -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff] + vgetexppbf16 -512(,%rbp,2), %xmm22 + +// CHECK: vgetexppbf16 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0x8f,0x42,0x71,0x7f] + vgetexppbf16 2032(%rcx), 
%xmm22 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0x9f,0x42,0x72,0x80] + vgetexppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vgetexppbf16 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vgetexppbf16 (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x38,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 (%rip){1to16}, %ymm22 + +// CHECK: vgetexppbf16 -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff] + vgetexppbf16 -1024(,%rbp,2), %ymm22 + +// CHECK: vgetexppbf16 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xaf,0x42,0x71,0x7f] + vgetexppbf16 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xbf,0x42,0x72,0x80] + vgetexppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vgetexppbf16 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vgetexppbf16 (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x58,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 (%rip){1to32}, %zmm22 + +// CHECK: vgetexppbf16 -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff] + vgetexppbf16 -2048(,%rbp,2), %zmm22 + +// CHECK: vgetexppbf16 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xcf,0x42,0x71,0x7f] + vgetexppbf16 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xdf,0x42,0x72,0x80] + vgetexppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %zmm23, %zmm22 + +// CHECK: vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} + +// CHECK: vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %ymm23, %ymm22 + +// CHECK: vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} + +// CHECK: vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %xmm23, %xmm22 + +// CHECK: vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} + +// CHECK: vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x26,0xf7,0x7b] + 
vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vgetmantpbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vgetmantpbf16 $123, (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 $123, (%rip){1to8}, %xmm22 + +// CHECK: vgetmantpbf16 $123, -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x26,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vgetmantpbf16 $123, -512(,%rbp,2), %xmm22 + +// CHECK: vgetmantpbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0x9f,0x26,0x72,0x80,0x7b] + vgetmantpbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vgetmantpbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vgetmantpbf16 $123, (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 $123, (%rip){1to16}, %ymm22 + +// CHECK: vgetmantpbf16 $123, -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x26,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vgetmantpbf16 $123, -1024(,%rbp,2), %ymm22 + +// CHECK: vgetmantpbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x26,0x72,0x80,0x7b] + vgetmantpbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vgetmantpbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vgetmantpbf16 $123, (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 $123, (%rip){1to32}, %zmm22 + +// CHECK: vgetmantpbf16 $123, -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x26,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vgetmantpbf16 $123, -2048(,%rbp,2), %zmm22 + +// CHECK: vgetmantpbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x26,0x72,0x80,0x7b] + vgetmantpbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vmaxpbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5f,0xf0] + vmaxpbf16 %ymm24, %ymm23, %ymm22 + +// 
CHECK: vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5f,0xf0] + vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5f,0xf0] + vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmaxpbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5f,0xf0] + vmaxpbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5f,0xf0] + vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5f,0xf0] + vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmaxpbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5f,0xf0] + vmaxpbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5f,0xf0] + vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5f,0xf0] + vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vmaxpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vmaxpbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vmaxpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5f,0x34,0x6d,0x00,0xf8,0xff,0xff] + vmaxpbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vmaxpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5f,0x71,0x7f] + vmaxpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5f,0x72,0x80] + vmaxpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vmaxpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vmaxpbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vmaxpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5f,0x34,0x6d,0x00,0xfc,0xff,0xff] + vmaxpbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vmaxpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5f,0x71,0x7f] + vmaxpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5f,0x72,0x80] + vmaxpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vmaxpbf16 291(%r8,%rax,4), %xmm23, %xmm22 
{%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vmaxpbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vmaxpbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5f,0x34,0x6d,0x00,0xfe,0xff,0xff] + vmaxpbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vmaxpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5f,0x71,0x7f] + vmaxpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5f,0x72,0x80] + vmaxpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vminpbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5d,0xf0] + vminpbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5d,0xf0] + vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5d,0xf0] + vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vminpbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5d,0xf0] + vminpbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5d,0xf0] + vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5d,0xf0] + vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vminpbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5d,0xf0] + vminpbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5d,0xf0] + vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5d,0xf0] + vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vminpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vminpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vminpbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vminpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5d,0x34,0x6d,0x00,0xf8,0xff,0xff] + vminpbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vminpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5d,0x71,0x7f] + vminpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vminpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5d,0x72,0x80] + vminpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vminpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vminpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 291(%r8,%rax,4), 
%ymm23, %ymm22 {%k7} + +// CHECK: vminpbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vminpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5d,0x34,0x6d,0x00,0xfc,0xff,0xff] + vminpbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vminpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5d,0x71,0x7f] + vminpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vminpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5d,0x72,0x80] + vminpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vminpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vminpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vminpbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vminpbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5d,0x34,0x6d,0x00,0xfe,0xff,0xff] + vminpbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vminpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5d,0x71,0x7f] + vminpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vminpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5d,0x72,0x80] + vminpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vmulnepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x59,0xf0] + vmulnepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x59,0xf0] + vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x59,0xf0] + vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmulnepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x59,0xf0] + vmulnepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x59,0xf0] + vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x59,0xf0] + vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmulnepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x59,0xf0] + vmulnepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x59,0xf0] + vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x59,0xf0] + vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vmulnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vmulnepbf16 (%rip){1to32}, %zmm23, 
%zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vmulnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x59,0x34,0x6d,0x00,0xf8,0xff,0xff] + vmulnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vmulnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x59,0x71,0x7f] + vmulnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x59,0x72,0x80] + vmulnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vmulnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vmulnepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vmulnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x59,0x34,0x6d,0x00,0xfc,0xff,0xff] + vmulnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vmulnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x59,0x71,0x7f] + vmulnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x59,0x72,0x80] + vmulnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vmulnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vmulnepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vmulnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x59,0x34,0x6d,0x00,0xfe,0xff,0xff] + vmulnepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vmulnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x59,0x71,0x7f] + vmulnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x59,0x72,0x80] + vmulnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vrcppbf16 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4c,0xf7] + vrcppbf16 %xmm23, %xmm22 + +// CHECK: vrcppbf16 %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x0f,0x4c,0xf7] + vrcppbf16 %xmm23, %xmm22 {%k7} + +// CHECK: vrcppbf16 %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa6,0x7c,0x8f,0x4c,0xf7] + vrcppbf16 %xmm23, %xmm22 {%k7} {z} + +// CHECK: vrcppbf16 %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4c,0xf7] + vrcppbf16 %zmm23, %zmm22 + +// CHECK: vrcppbf16 %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x4f,0x4c,0xf7] + vrcppbf16 %zmm23, %zmm22 {%k7} + +// CHECK: vrcppbf16 %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: 
[0x62,0xa6,0x7c,0xcf,0x4c,0xf7] + vrcppbf16 %zmm23, %zmm22 {%k7} {z} + +// CHECK: vrcppbf16 %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4c,0xf7] + vrcppbf16 %ymm23, %ymm22 + +// CHECK: vrcppbf16 %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x2f,0x4c,0xf7] + vrcppbf16 %ymm23, %ymm22 {%k7} + +// CHECK: vrcppbf16 %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa6,0x7c,0xaf,0x4c,0xf7] + vrcppbf16 %ymm23, %ymm22 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vrcppbf16 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x0f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vrcppbf16 (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x18,0x4c,0x35,0x00,0x00,0x00,0x00] + vrcppbf16 (%rip){1to8}, %xmm22 + +// CHECK: vrcppbf16 -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x08,0x4c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vrcppbf16 -512(,%rbp,2), %xmm22 + +// CHECK: vrcppbf16 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0x8f,0x4c,0x71,0x7f] + vrcppbf16 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vrcppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0x9f,0x4c,0x72,0x80] + vrcppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vrcppbf16 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x2f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vrcppbf16 (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x38,0x4c,0x35,0x00,0x00,0x00,0x00] + vrcppbf16 (%rip){1to16}, %ymm22 + +// CHECK: vrcppbf16 -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x28,0x4c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vrcppbf16 -1024(,%rbp,2), %ymm22 + +// CHECK: vrcppbf16 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xaf,0x4c,0x71,0x7f] + vrcppbf16 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vrcppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xbf,0x4c,0x72,0x80] + vrcppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vrcppbf16 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x4f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vrcppbf16 (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x58,0x4c,0x35,0x00,0x00,0x00,0x00] + vrcppbf16 (%rip){1to32}, %zmm22 + +// CHECK: vrcppbf16 -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x48,0x4c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vrcppbf16 -2048(,%rbp,2), %zmm22 + +// CHECK: vrcppbf16 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xcf,0x4c,0x71,0x7f] + vrcppbf16 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vrcppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xdf,0x4c,0x72,0x80] + vrcppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x56,0xf7,0x7b] + vreducenepbf16 $123, %zmm23, %zmm22 + +// CHECK: vreducenepbf16 $123, 
%zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x56,0xf7,0x7b] + vreducenepbf16 $123, %zmm23, %zmm22 {%k7} + +// CHECK: vreducenepbf16 $123, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x56,0xf7,0x7b] + vreducenepbf16 $123, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x56,0xf7,0x7b] + vreducenepbf16 $123, %ymm23, %ymm22 + +// CHECK: vreducenepbf16 $123, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x56,0xf7,0x7b] + vreducenepbf16 $123, %ymm23, %ymm22 {%k7} + +// CHECK: vreducenepbf16 $123, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x56,0xf7,0x7b] + vreducenepbf16 $123, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x56,0xf7,0x7b] + vreducenepbf16 $123, %xmm23, %xmm22 + +// CHECK: vreducenepbf16 $123, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x56,0xf7,0x7b] + vreducenepbf16 $123, %xmm23, %xmm22 {%k7} + +// CHECK: vreducenepbf16 $123, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x56,0xf7,0x7b] + vreducenepbf16 $123, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vreducenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vreducenepbf16 $123, (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 $123, (%rip){1to8}, %xmm22 + +// CHECK: vreducenepbf16 $123, -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x56,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vreducenepbf16 $123, -512(,%rbp,2), %xmm22 + +// CHECK: vreducenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x56,0x71,0x7f,0x7b] + vreducenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0x9f,0x56,0x72,0x80,0x7b] + vreducenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vreducenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vreducenepbf16 $123, (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 $123, (%rip){1to16}, %ymm22 + +// CHECK: vreducenepbf16 $123, -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x56,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vreducenepbf16 $123, -1024(,%rbp,2), %ymm22 + +// CHECK: vreducenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x56,0x71,0x7f,0x7b] + vreducenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x56,0x72,0x80,0x7b] + vreducenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 
+// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vreducenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vreducenepbf16 $123, (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 $123, (%rip){1to32}, %zmm22 + +// CHECK: vreducenepbf16 $123, -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x56,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vreducenepbf16 $123, -2048(,%rbp,2), %zmm22 + +// CHECK: vreducenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x56,0x71,0x7f,0x7b] + vreducenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x56,0x72,0x80,0x7b] + vreducenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %zmm23, %zmm22 + +// CHECK: vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %ymm23, %ymm22 + +// CHECK: vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %xmm23, %xmm22 + +// CHECK: vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vrndscalenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 $123, (%rip){1to8}, %xmm22 + +// CHECK: vrndscalenepbf16 $123, -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x08,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -512(,%rbp,2), %xmm22 + +// CHECK: vrndscalenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: 
encoding: [0x62,0xe3,0x7f,0x9f,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vrndscalenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 $123, (%rip){1to16}, %ymm22 + +// CHECK: vrndscalenepbf16 $123, -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x08,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -1024(,%rbp,2), %ymm22 + +// CHECK: vrndscalenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vrndscalenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 $123, (%rip){1to32}, %zmm22 + +// CHECK: vrndscalenepbf16 $123, -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x08,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -2048(,%rbp,2), %zmm22 + +// CHECK: vrndscalenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4e,0xf7] + vrsqrtpbf16 %xmm23, %xmm22 + +// CHECK: vrsqrtpbf16 %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x0f,0x4e,0xf7] + vrsqrtpbf16 %xmm23, %xmm22 {%k7} + +// CHECK: vrsqrtpbf16 %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa6,0x7c,0x8f,0x4e,0xf7] + vrsqrtpbf16 %xmm23, %xmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4e,0xf7] + vrsqrtpbf16 %zmm23, %zmm22 + +// CHECK: vrsqrtpbf16 %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x4f,0x4e,0xf7] + vrsqrtpbf16 %zmm23, %zmm22 {%k7} + +// CHECK: vrsqrtpbf16 %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa6,0x7c,0xcf,0x4e,0xf7] + vrsqrtpbf16 %zmm23, %zmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4e,0xf7] + vrsqrtpbf16 %ymm23, %ymm22 + +// CHECK: vrsqrtpbf16 %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x2f,0x4e,0xf7] + vrsqrtpbf16 %ymm23, %ymm22 {%k7} + +// CHECK: vrsqrtpbf16 %ymm23, %ymm22 {%k7} {z} +// CHECK: 
encoding: [0x62,0xa6,0x7c,0xaf,0x4e,0xf7] + vrsqrtpbf16 %ymm23, %ymm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vrsqrtpbf16 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x0f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vrsqrtpbf16 (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x18,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 (%rip){1to8}, %xmm22 + +// CHECK: vrsqrtpbf16 -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x08,0x4e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vrsqrtpbf16 -512(,%rbp,2), %xmm22 + +// CHECK: vrsqrtpbf16 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0x8f,0x4e,0x71,0x7f] + vrsqrtpbf16 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0x9f,0x4e,0x72,0x80] + vrsqrtpbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vrsqrtpbf16 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x2f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vrsqrtpbf16 (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x38,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 (%rip){1to16}, %ymm22 + +// CHECK: vrsqrtpbf16 -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x28,0x4e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vrsqrtpbf16 -1024(,%rbp,2), %ymm22 + +// CHECK: vrsqrtpbf16 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xaf,0x4e,0x71,0x7f] + vrsqrtpbf16 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xbf,0x4e,0x72,0x80] + vrsqrtpbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vrsqrtpbf16 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x4f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vrsqrtpbf16 (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x58,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 (%rip){1to32}, %zmm22 + +// CHECK: vrsqrtpbf16 -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x48,0x4e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vrsqrtpbf16 -2048(,%rbp,2), %zmm22 + +// CHECK: vrsqrtpbf16 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xcf,0x4e,0x71,0x7f] + vrsqrtpbf16 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xdf,0x4e,0x72,0x80] + vrsqrtpbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vscalefpbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x2c,0xf0] + vscalefpbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x2c,0xf0] + vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x2c,0xf0] + vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vscalefpbf16 
%zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x2c,0xf0] + vscalefpbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x2c,0xf0] + vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x2c,0xf0] + vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vscalefpbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x2c,0xf0] + vscalefpbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x2c,0xf0] + vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x2c,0xf0] + vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vscalefpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vscalefpbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vscalefpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x2c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vscalefpbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vscalefpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x2c,0x71,0x7f] + vscalefpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x2c,0x72,0x80] + vscalefpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vscalefpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vscalefpbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vscalefpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x2c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vscalefpbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vscalefpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x2c,0x71,0x7f] + vscalefpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x2c,0x72,0x80] + vscalefpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vscalefpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vscalefpbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: 
encoding: [0x62,0xe6,0x44,0x10,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vscalefpbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x2c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vscalefpbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vscalefpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x2c,0x71,0x7f] + vscalefpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x2c,0x72,0x80] + vscalefpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x51,0xf7] + vsqrtnepbf16 %xmm23, %xmm22 + +// CHECK: vsqrtnepbf16 %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x0f,0x51,0xf7] + vsqrtnepbf16 %xmm23, %xmm22 {%k7} + +// CHECK: vsqrtnepbf16 %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0x8f,0x51,0xf7] + vsqrtnepbf16 %xmm23, %xmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x51,0xf7] + vsqrtnepbf16 %zmm23, %zmm22 + +// CHECK: vsqrtnepbf16 %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x4f,0x51,0xf7] + vsqrtnepbf16 %zmm23, %zmm22 {%k7} + +// CHECK: vsqrtnepbf16 %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0xcf,0x51,0xf7] + vsqrtnepbf16 %zmm23, %zmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x51,0xf7] + vsqrtnepbf16 %ymm23, %ymm22 + +// CHECK: vsqrtnepbf16 %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x2f,0x51,0xf7] + vsqrtnepbf16 %ymm23, %ymm22 {%k7} + +// CHECK: vsqrtnepbf16 %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0xaf,0x51,0xf7] + vsqrtnepbf16 %ymm23, %ymm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vsqrtnepbf16 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x0f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vsqrtnepbf16 (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x18,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 (%rip){1to8}, %xmm22 + +// CHECK: vsqrtnepbf16 -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + vsqrtnepbf16 -512(,%rbp,2), %xmm22 + +// CHECK: vsqrtnepbf16 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0x8f,0x51,0x71,0x7f] + vsqrtnepbf16 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0x9f,0x51,0x72,0x80] + vsqrtnepbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vsqrtnepbf16 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x2f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vsqrtnepbf16 (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x38,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 (%rip){1to16}, %ymm22 + +// CHECK: vsqrtnepbf16 -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x28,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + vsqrtnepbf16 -1024(,%rbp,2), %ymm22 + +// 
CHECK: vsqrtnepbf16 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xaf,0x51,0x71,0x7f] + vsqrtnepbf16 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xbf,0x51,0x72,0x80] + vsqrtnepbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vsqrtnepbf16 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x4f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vsqrtnepbf16 (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x58,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 (%rip){1to32}, %zmm22 + +// CHECK: vsqrtnepbf16 -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x48,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff] + vsqrtnepbf16 -2048(,%rbp,2), %zmm22 + +// CHECK: vsqrtnepbf16 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xcf,0x51,0x71,0x7f] + vsqrtnepbf16 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xdf,0x51,0x72,0x80] + vsqrtnepbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vsubnepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5c,0xf0] + vsubnepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5c,0xf0] + vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5c,0xf0] + vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vsubnepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5c,0xf0] + vsubnepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5c,0xf0] + vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5c,0xf0] + vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vsubnepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5c,0xf0] + vsubnepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5c,0xf0] + vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5c,0xf0] + vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vsubnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vsubnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vsubnepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vsubnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vsubnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vsubnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5c,0x71,0x7f] + vsubnepbf16 
8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vsubnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5c,0x72,0x80] + vsubnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vsubnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vsubnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vsubnepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vsubnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vsubnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vsubnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5c,0x71,0x7f] + vsubnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vsubnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5c,0x72,0x80] + vsubnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vsubnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vsubnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vsubnepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vsubnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vsubnepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vsubnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5c,0x71,0x7f] + vsubnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vsubnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5c,0x72,0x80] + vsubnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + diff --git a/llvm/test/MC/X86/avx10.2-bf16-64-intel.s b/llvm/test/MC/X86/avx10.2-bf16-64-intel.s new file mode 100644 index 0000000000000..5f3dc45ba7745 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-bf16-64-intel.s @@ -0,0 +1,3014 @@ +// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vaddnepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x58,0xf0] + vaddnepbf16 ymm22, ymm23, ymm24 + +// CHECK: vaddnepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x58,0xf0] + vaddnepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vaddnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x58,0xf0] + vaddnepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vaddnepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x58,0xf0] + vaddnepbf16 zmm22, zmm23, zmm24 + +// CHECK: vaddnepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x58,0xf0] + vaddnepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vaddnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: 
[0x62,0x85,0x45,0xc7,0x58,0xf0] + vaddnepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vaddnepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x58,0xf0] + vaddnepbf16 xmm22, xmm23, xmm24 + +// CHECK: vaddnepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x58,0xf0] + vaddnepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vaddnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x58,0xf0] + vaddnepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vaddnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vaddnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vaddnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vaddnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x58,0x34,0x6d,0x00,0xf8,0xff,0xff] + vaddnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vaddnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x58,0x71,0x7f] + vaddnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vaddnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x58,0x72,0x80] + vaddnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vaddnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vaddnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vaddnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vaddnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x58,0x34,0x6d,0x00,0xfc,0xff,0xff] + vaddnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vaddnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x58,0x71,0x7f] + vaddnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vaddnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x58,0x72,0x80] + vaddnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vaddnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vaddnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vaddnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// 
CHECK: vaddnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x58,0x34,0x6d,0x00,0xfe,0xff,0xff] + vaddnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vaddnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x58,0x71,0x7f] + vaddnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vaddnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x58,0x72,0x80] + vaddnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vcmppbf16 k5, ymm23, ymm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x20,0xc2,0xe8,0x7b] + vcmppbf16 k5, ymm23, ymm24, 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm23, ymm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x27,0xc2,0xe8,0x7b] + vcmppbf16 k5 {k7}, ymm23, ymm24, 123 + +// CHECK: vcmppbf16 k5, xmm23, xmm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x00,0xc2,0xe8,0x7b] + vcmppbf16 k5, xmm23, xmm24, 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm23, xmm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x07,0xc2,0xe8,0x7b] + vcmppbf16 k5 {k7}, xmm23, xmm24, 123 + +// CHECK: vcmppbf16 k5, zmm23, zmm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x40,0xc2,0xe8,0x7b] + vcmppbf16 k5, zmm23, zmm24, 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm23, zmm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x47,0xc2,0xe8,0x7b] + vcmppbf16 k5 {k7}, zmm23, zmm24, 123 + +// CHECK: vcmppbf16 k5, zmm23, zmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x47,0x40,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, zmm23, zmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xd3,0x47,0x47,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vcmppbf16 k5, zmm23, word ptr [rip]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x50,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 k5, zmm23, word ptr [rip]{1to32}, 123 + +// CHECK: vcmppbf16 k5, zmm23, zmmword ptr [2*rbp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x40,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vcmppbf16 k5, zmm23, zmmword ptr [2*rbp - 2048], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm23, zmmword ptr [rcx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x47,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, zmm23, zmmword ptr [rcx + 8128], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm23, word ptr [rdx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x57,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, zmm23, word ptr [rdx - 256]{1to32}, 123 + +// CHECK: vcmppbf16 k5, xmm23, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x47,0x00,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, xmm23, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xd3,0x47,0x07,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vcmppbf16 k5, xmm23, word ptr [rip]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x10,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 k5, xmm23, word ptr [rip]{1to8}, 123 + +// CHECK: vcmppbf16 k5, xmm23, xmmword ptr [2*rbp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x00,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vcmppbf16 k5, xmm23, xmmword ptr [2*rbp - 512], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm23, xmmword ptr [rcx + 
2032], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x07,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, xmm23, xmmword ptr [rcx + 2032], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm23, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x17,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, xmm23, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vcmppbf16 k5, ymm23, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x47,0x20,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, ymm23, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xd3,0x47,0x27,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vcmppbf16 k5, ymm23, word ptr [rip]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x30,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 k5, ymm23, word ptr [rip]{1to16}, 123 + +// CHECK: vcmppbf16 k5, ymm23, ymmword ptr [2*rbp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x20,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vcmppbf16 k5, ymm23, ymmword ptr [2*rbp - 1024], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm23, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x27,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, ymm23, ymmword ptr [rcx + 4064], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm23, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x37,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, ymm23, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vcomsbf16 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x2f,0xf7] + vcomsbf16 xmm22, xmm23 + +// CHECK: vcomsbf16 xmm22, word ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomsbf16 xmm22, word ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcomsbf16 xmm22, word ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00] + vcomsbf16 xmm22, word ptr [r8 + 4*rax + 291] + +// CHECK: vcomsbf16 xmm22, word ptr [rip] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x35,0x00,0x00,0x00,0x00] + vcomsbf16 xmm22, word ptr [rip] + +// CHECK: vcomsbf16 xmm22, word ptr [2*rbp - 64] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff] + vcomsbf16 xmm22, word ptr [2*rbp - 64] + +// CHECK: vcomsbf16 xmm22, word ptr [rcx + 254] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x71,0x7f] + vcomsbf16 xmm22, word ptr [rcx + 254] + +// CHECK: vcomsbf16 xmm22, word ptr [rdx - 256] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x72,0x80] + vcomsbf16 xmm22, word ptr [rdx - 256] + +// CHECK: vdivnepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5e,0xf0] + vdivnepbf16 ymm22, ymm23, ymm24 + +// CHECK: vdivnepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5e,0xf0] + vdivnepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vdivnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5e,0xf0] + vdivnepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vdivnepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5e,0xf0] + vdivnepbf16 zmm22, zmm23, zmm24 + +// CHECK: vdivnepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5e,0xf0] + vdivnepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vdivnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5e,0xf0] + vdivnepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vdivnepbf16 xmm22, xmm23, 
xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5e,0xf0] + vdivnepbf16 xmm22, xmm23, xmm24 + +// CHECK: vdivnepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5e,0xf0] + vdivnepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vdivnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5e,0xf0] + vdivnepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vdivnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vdivnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vdivnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vdivnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vdivnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vdivnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5e,0x71,0x7f] + vdivnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vdivnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5e,0x72,0x80] + vdivnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vdivnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vdivnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vdivnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vdivnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vdivnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vdivnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5e,0x71,0x7f] + vdivnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vdivnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5e,0x72,0x80] + vdivnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vdivnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vdivnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vdivnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vdivnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: 
[0x62,0xe5,0x45,0x00,0x5e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vdivnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vdivnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5e,0x71,0x7f] + vdivnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vdivnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5e,0x72,0x80] + vdivnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmadd132nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x98,0xf0] + vfmadd132nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x98,0xf0] + vfmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x98,0xf0] + vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmadd132nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x98,0xf0] + vfmadd132nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x98,0xf0] + vfmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x98,0xf0] + vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmadd132nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x98,0xf0] + vfmadd132nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x98,0xf0] + vfmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x98,0xf0] + vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x98,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x98,0x71,0x7f] + vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x98,0x72,0x80] + vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 ymm22 
{k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x98,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x98,0x71,0x7f] + vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x98,0x72,0x80] + vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x98,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x98,0x71,0x7f] + vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x98,0x72,0x80] + vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmadd213nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xa8,0xf0] + vfmadd213nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xa8,0xf0] + vfmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xa8,0xf0] + vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmadd213nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xa8,0xf0] + vfmadd213nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xa8,0xf0] + vfmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xa8,0xf0] + vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmadd213nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xa8,0xf0] + vfmadd213nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xa8,0xf0] + vfmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xa8,0xf0] + vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 
8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xa8,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xa8,0x71,0x7f] + vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xa8,0x72,0x80] + vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xa8,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xa8,0x71,0x7f] + vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xa8,0x72,0x80] + vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xa8,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xa8,0x71,0x7f] + vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// 
CHECK: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xa8,0x72,0x80] + vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmadd231nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xb8,0xf0] + vfmadd231nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xb8,0xf0] + vfmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xb8,0xf0] + vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmadd231nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xb8,0xf0] + vfmadd231nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xb8,0xf0] + vfmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xb8,0xf0] + vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmadd231nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xb8,0xf0] + vfmadd231nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xb8,0xf0] + vfmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xb8,0xf0] + vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xb8,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xb8,0x71,0x7f] + vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xb8,0x72,0x80] + vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 
1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xb8,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xb8,0x71,0x7f] + vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xb8,0x72,0x80] + vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xb8,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xb8,0x71,0x7f] + vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xb8,0x72,0x80] + vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsub132nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9a,0xf0] + vfmsub132nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9a,0xf0] + vfmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9a,0xf0] + vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmsub132nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9a,0xf0] + vfmsub132nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9a,0xf0] + vfmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9a,0xf0] + vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmsub132nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9a,0xf0] + vfmsub132nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9a,0xf0] + vfmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9a,0xf0] + vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: 
[0x62,0xc6,0x44,0x47,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9a,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x9a,0x71,0x7f] + vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9a,0x72,0x80] + vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9a,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9a,0x71,0x7f] + vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9a,0x72,0x80] + vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9a,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9a,0x71,0x7f] + vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9a,0x72,0x80] + vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsub213nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: 
[0x62,0x86,0x44,0x20,0xaa,0xf0] + vfmsub213nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xaa,0xf0] + vfmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xaa,0xf0] + vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmsub213nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xaa,0xf0] + vfmsub213nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xaa,0xf0] + vfmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xaa,0xf0] + vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmsub213nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xaa,0xf0] + vfmsub213nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xaa,0xf0] + vfmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xaa,0xf0] + vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xaa,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xaa,0x71,0x7f] + vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xaa,0x72,0x80] + vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xaa,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: 
[0x62,0xe6,0x44,0xa7,0xaa,0x71,0x7f] + vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xaa,0x72,0x80] + vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xaa,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xaa,0x71,0x7f] + vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xaa,0x72,0x80] + vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsub231nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xba,0xf0] + vfmsub231nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xba,0xf0] + vfmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xba,0xf0] + vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmsub231nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xba,0xf0] + vfmsub231nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xba,0xf0] + vfmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xba,0xf0] + vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmsub231nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xba,0xf0] + vfmsub231nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xba,0xf0] + vfmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xba,0xf0] + vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xba,0x35,0x00,0x00,0x00,0x00] + 
vfmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xba,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xba,0x71,0x7f] + vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xba,0x72,0x80] + vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xba,0x35,0x00,0x00,0x00,0x00] + vfmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xba,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xba,0x71,0x7f] + vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xba,0x72,0x80] + vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xba,0x35,0x00,0x00,0x00,0x00] + vfmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xba,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xba,0x71,0x7f] + vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xba,0x72,0x80] + vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmadd132nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9c,0xf0] + vfnmadd132nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9c,0xf0] + vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmadd132nepbf16 ymm22 {k7} 
{z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9c,0xf0] + vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmadd132nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9c,0xf0] + vfnmadd132nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9c,0xf0] + vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9c,0xf0] + vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmadd132nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9c,0xf0] + vfnmadd132nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9c,0xf0] + vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9c,0xf0] + vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x9c,0x71,0x7f] + vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9c,0x72,0x80] + vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9c,0x71,0x7f] + vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9c,0x72,0x80] + 
vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9c,0x71,0x7f] + vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9c,0x72,0x80] + vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmadd213nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xac,0xf0] + vfnmadd213nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xac,0xf0] + vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xac,0xf0] + vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmadd213nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xac,0xf0] + vfnmadd213nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xac,0xf0] + vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xac,0xf0] + vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmadd213nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xac,0xf0] + vfnmadd213nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xac,0xf0] + vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xac,0xf0] + vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xac,0x34,0x6d,0x00,0xf8,0xff,0xff] + 
vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xac,0x71,0x7f] + vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xac,0x72,0x80] + vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xac,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xac,0x71,0x7f] + vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xac,0x72,0x80] + vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xac,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xac,0x71,0x7f] + vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xac,0x72,0x80] + vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmadd231nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xbc,0xf0] + vfnmadd231nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xbc,0xf0] + vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xbc,0xf0] + vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmadd231nepbf16 zmm22, zmm23, zmm24 +// 
CHECK: encoding: [0x62,0x86,0x44,0x40,0xbc,0xf0] + vfnmadd231nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xbc,0xf0] + vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xbc,0xf0] + vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmadd231nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xbc,0xf0] + vfnmadd231nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xbc,0xf0] + vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xbc,0xf0] + vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xbc,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xbc,0x71,0x7f] + vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xbc,0x72,0x80] + vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xbc,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xbc,0x71,0x7f] + vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xbc,0x72,0x80] + vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xa6,0x44,0x00,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xbc,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xbc,0x71,0x7f] + vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xbc,0x72,0x80] + vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmsub132nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9e,0xf0] + vfnmsub132nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9e,0xf0] + vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9e,0xf0] + vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmsub132nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9e,0xf0] + vfnmsub132nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9e,0xf0] + vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9e,0xf0] + vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmsub132nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9e,0xf0] + vfnmsub132nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9e,0xf0] + vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9e,0xf0] + vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: 
[0x62,0xe6,0x44,0xc7,0x9e,0x71,0x7f] + vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9e,0x72,0x80] + vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9e,0x71,0x7f] + vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9e,0x72,0x80] + vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9e,0x71,0x7f] + vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9e,0x72,0x80] + vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmsub213nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xae,0xf0] + vfnmsub213nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xae,0xf0] + vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xae,0xf0] + vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmsub213nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xae,0xf0] + vfnmsub213nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: 
encoding: [0x62,0x86,0x44,0x47,0xae,0xf0] + vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xae,0xf0] + vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmsub213nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xae,0xf0] + vfnmsub213nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xae,0xf0] + vfnmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xae,0xf0] + vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xae,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xae,0x71,0x7f] + vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xae,0x72,0x80] + vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xae,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xae,0x71,0x7f] + vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xae,0x72,0x80] + vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub213nepbf16 xmm22 {k7}, 
xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xae,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xae,0x71,0x7f] + vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xae,0x72,0x80] + vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmsub231nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xbe,0xf0] + vfnmsub231nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xbe,0xf0] + vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xbe,0xf0] + vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmsub231nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xbe,0xf0] + vfnmsub231nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xbe,0xf0] + vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xbe,0xf0] + vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmsub231nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xbe,0xf0] + vfnmsub231nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xbe,0xf0] + vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xbe,0xf0] + vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xbe,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xbe,0x71,0x7f] + vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: 
encoding: [0x62,0xe6,0x44,0xd7,0xbe,0x72,0x80] + vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xbe,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xbe,0x71,0x7f] + vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xbe,0x72,0x80] + vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xbe,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xbe,0x71,0x7f] + vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xbe,0x72,0x80] + vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfpclasspbf16 k5, zmm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x48,0x66,0xef,0x7b] + vfpclasspbf16 k5, zmm23, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, zmm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x4f,0x66,0xef,0x7b] + vfpclasspbf16 k5 {k7}, zmm23, 123 + +// CHECK: vfpclasspbf16 k5, ymm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x28,0x66,0xef,0x7b] + vfpclasspbf16 k5, ymm23, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, ymm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x2f,0x66,0xef,0x7b] + vfpclasspbf16 k5 {k7}, ymm23, 123 + +// CHECK: vfpclasspbf16 k5, xmm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x08,0x66,0xef,0x7b] + vfpclasspbf16 k5, xmm23, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x0f,0x66,0xef,0x7b] + vfpclasspbf16 k5 {k7}, xmm23, 123 + +// CHECK: vfpclasspbf16 k5, xmmword ptr [rbp + 8*r14 
+ 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x08,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vfpclasspbf16 k5, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xd3,0x7f,0x0f,0x66,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vfpclasspbf16 k5 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vfpclasspbf16 k5, word ptr [rip]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 k5, word ptr [rip]{1to8}, 123 + +// CHECK: vfpclasspbf16 k5, xmmword ptr [2*rbp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vfpclasspbf16 k5, xmmword ptr [2*rbp - 512], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, xmmword ptr [rcx + 2032], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vfpclasspbf16 k5, word ptr [rip]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 k5, word ptr [rip]{1to16}, 123 + +// CHECK: vfpclasspbf16 k5, ymmword ptr [2*rbp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vfpclasspbf16 k5, ymmword ptr [2*rbp - 1024], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, ymmword ptr [rcx + 4064], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vfpclasspbf16 k5, word ptr [rip]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 k5, word ptr [rip]{1to32}, 123 + +// CHECK: vfpclasspbf16 k5, zmmword ptr [2*rbp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vfpclasspbf16 k5, zmmword ptr [2*rbp - 2048], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, zmmword ptr [rcx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, zmmword ptr [rcx + 8128], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to32}, 123 + +// CHECK: vgetexppbf16 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xf7] + vgetexppbf16 xmm22, xmm23 + +// CHECK: vgetexppbf16 xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x0f,0x42,0xf7] + vgetexppbf16 xmm22 {k7}, xmm23 + +// CHECK: vgetexppbf16 xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x8f,0x42,0xf7] + vgetexppbf16 xmm22 {k7} {z}, xmm23 + +// CHECK: vgetexppbf16 zmm22, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xf7] + vgetexppbf16 zmm22, zmm23 + +// CHECK: vgetexppbf16 zmm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x4f,0x42,0xf7] + vgetexppbf16 zmm22 {k7}, zmm23 + +// CHECK: vgetexppbf16 zmm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0xcf,0x42,0xf7] + vgetexppbf16 zmm22 {k7} {z}, zmm23 + +// CHECK: vgetexppbf16 ymm22, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xf7] + vgetexppbf16 ymm22, ymm23 + +// CHECK: 
vgetexppbf16 ymm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x2f,0x42,0xf7] + vgetexppbf16 ymm22 {k7}, ymm23 + +// CHECK: vgetexppbf16 ymm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0xaf,0x42,0xf7] + vgetexppbf16 ymm22 {k7} {z}, ymm23 + +// CHECK: vgetexppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vgetexppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vgetexppbf16 xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7d,0x18,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 xmm22, word ptr [rip]{1to8} + +// CHECK: vgetexppbf16 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff] + vgetexppbf16 xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vgetexppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x7d,0x8f,0x42,0x71,0x7f] + vgetexppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vgetexppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7d,0x9f,0x42,0x72,0x80] + vgetexppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vgetexppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vgetexppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vgetexppbf16 ymm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7d,0x38,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 ymm22, word ptr [rip]{1to16} + +// CHECK: vgetexppbf16 ymm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x7d,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff] + vgetexppbf16 ymm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vgetexppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x7d,0xaf,0x42,0x71,0x7f] + vgetexppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vgetexppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7d,0xbf,0x42,0x72,0x80] + vgetexppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vgetexppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vgetexppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vgetexppbf16 zmm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7d,0x58,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 zmm22, word ptr [rip]{1to32} + +// CHECK: vgetexppbf16 zmm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x7d,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff] + vgetexppbf16 zmm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vgetexppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x7d,0xcf,0x42,0x71,0x7f] + vgetexppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vgetexppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} 
+// CHECK: encoding: [0x62,0xe5,0x7d,0xdf,0x42,0x72,0x80] + vgetexppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} + +// CHECK: vgetmantpbf16 zmm22, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x26,0xf7,0x7b] + vgetmantpbf16 zmm22, zmm23, 123 + +// CHECK: vgetmantpbf16 zmm22 {k7}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x26,0xf7,0x7b] + vgetmantpbf16 zmm22 {k7}, zmm23, 123 + +// CHECK: vgetmantpbf16 zmm22 {k7} {z}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x26,0xf7,0x7b] + vgetmantpbf16 zmm22 {k7} {z}, zmm23, 123 + +// CHECK: vgetmantpbf16 ymm22, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x26,0xf7,0x7b] + vgetmantpbf16 ymm22, ymm23, 123 + +// CHECK: vgetmantpbf16 ymm22 {k7}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x26,0xf7,0x7b] + vgetmantpbf16 ymm22 {k7}, ymm23, 123 + +// CHECK: vgetmantpbf16 ymm22 {k7} {z}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x26,0xf7,0x7b] + vgetmantpbf16 ymm22 {k7} {z}, ymm23, 123 + +// CHECK: vgetmantpbf16 xmm22, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x26,0xf7,0x7b] + vgetmantpbf16 xmm22, xmm23, 123 + +// CHECK: vgetmantpbf16 xmm22 {k7}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x26,0xf7,0x7b] + vgetmantpbf16 xmm22 {k7}, xmm23, 123 + +// CHECK: vgetmantpbf16 xmm22 {k7} {z}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x26,0xf7,0x7b] + vgetmantpbf16 xmm22 {k7} {z}, xmm23, 123 + +// CHECK: vgetmantpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vgetmantpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vgetmantpbf16 xmm22, word ptr [rip]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 xmm22, word ptr [rip]{1to8}, 123 + +// CHECK: vgetmantpbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x26,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vgetmantpbf16 xmm22, xmmword ptr [2*rbp - 512], 123 + +// CHECK: vgetmantpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 + +// CHECK: vgetmantpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x9f,0x26,0x72,0x80,0x7b] + vgetmantpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vgetmantpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vgetmantpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vgetmantpbf16 ymm22, word ptr [rip]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 ymm22, word ptr [rip]{1to16}, 123 + +// CHECK: vgetmantpbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x26,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vgetmantpbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 + +// CHECK: vgetmantpbf16 ymm22 {k7} {z}, 
ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 + +// CHECK: vgetmantpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x26,0x72,0x80,0x7b] + vgetmantpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vgetmantpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vgetmantpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vgetmantpbf16 zmm22, word ptr [rip]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 zmm22, word ptr [rip]{1to32}, 123 + +// CHECK: vgetmantpbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x26,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vgetmantpbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 + +// CHECK: vgetmantpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 + +// CHECK: vgetmantpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x26,0x72,0x80,0x7b] + vgetmantpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 + +// CHECK: vmaxpbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5f,0xf0] + vmaxpbf16 ymm22, ymm23, ymm24 + +// CHECK: vmaxpbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5f,0xf0] + vmaxpbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vmaxpbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5f,0xf0] + vmaxpbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vmaxpbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5f,0xf0] + vmaxpbf16 zmm22, zmm23, zmm24 + +// CHECK: vmaxpbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5f,0xf0] + vmaxpbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vmaxpbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5f,0xf0] + vmaxpbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vmaxpbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5f,0xf0] + vmaxpbf16 xmm22, xmm23, xmm24 + +// CHECK: vmaxpbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5f,0xf0] + vmaxpbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vmaxpbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5f,0xf0] + vmaxpbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vmaxpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmaxpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vmaxpbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vmaxpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: 
[0x62,0xe5,0x45,0x40,0x5f,0x34,0x6d,0x00,0xf8,0xff,0xff] + vmaxpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vmaxpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5f,0x71,0x7f] + vmaxpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vmaxpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5f,0x72,0x80] + vmaxpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vmaxpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmaxpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vmaxpbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vmaxpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5f,0x34,0x6d,0x00,0xfc,0xff,0xff] + vmaxpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vmaxpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5f,0x71,0x7f] + vmaxpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vmaxpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5f,0x72,0x80] + vmaxpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vmaxpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmaxpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vmaxpbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vmaxpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5f,0x34,0x6d,0x00,0xfe,0xff,0xff] + vmaxpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vmaxpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5f,0x71,0x7f] + vmaxpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vmaxpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5f,0x72,0x80] + vmaxpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vminpbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5d,0xf0] + vminpbf16 ymm22, ymm23, ymm24 + +// CHECK: vminpbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5d,0xf0] + vminpbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vminpbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5d,0xf0] + vminpbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vminpbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5d,0xf0] + vminpbf16 zmm22, zmm23, zmm24 + +// CHECK: vminpbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5d,0xf0] + vminpbf16 zmm22 {k7}, zmm23, zmm24 + +// 
CHECK: vminpbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5d,0xf0] + vminpbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vminpbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5d,0xf0] + vminpbf16 xmm22, xmm23, xmm24 + +// CHECK: vminpbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5d,0xf0] + vminpbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vminpbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5d,0xf0] + vminpbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vminpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vminpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vminpbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vminpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5d,0x34,0x6d,0x00,0xf8,0xff,0xff] + vminpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vminpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5d,0x71,0x7f] + vminpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vminpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5d,0x72,0x80] + vminpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vminpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vminpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vminpbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vminpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5d,0x34,0x6d,0x00,0xfc,0xff,0xff] + vminpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vminpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5d,0x71,0x7f] + vminpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vminpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5d,0x72,0x80] + vminpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vminpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vminpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vminpbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: 
vminpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5d,0x34,0x6d,0x00,0xfe,0xff,0xff] + vminpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vminpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5d,0x71,0x7f] + vminpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vminpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5d,0x72,0x80] + vminpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vmulnepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x59,0xf0] + vmulnepbf16 ymm22, ymm23, ymm24 + +// CHECK: vmulnepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x59,0xf0] + vmulnepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vmulnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x59,0xf0] + vmulnepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vmulnepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x59,0xf0] + vmulnepbf16 zmm22, zmm23, zmm24 + +// CHECK: vmulnepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x59,0xf0] + vmulnepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vmulnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x59,0xf0] + vmulnepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vmulnepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x59,0xf0] + vmulnepbf16 xmm22, xmm23, xmm24 + +// CHECK: vmulnepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x59,0xf0] + vmulnepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vmulnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x59,0xf0] + vmulnepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vmulnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmulnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vmulnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vmulnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x59,0x34,0x6d,0x00,0xf8,0xff,0xff] + vmulnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vmulnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x59,0x71,0x7f] + vmulnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vmulnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x59,0x72,0x80] + vmulnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vmulnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmulnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vmulnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// 
CHECK: encoding: [0x62,0xe5,0x45,0x30,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vmulnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x59,0x34,0x6d,0x00,0xfc,0xff,0xff] + vmulnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vmulnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x59,0x71,0x7f] + vmulnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vmulnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x59,0x72,0x80] + vmulnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vmulnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmulnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vmulnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vmulnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x59,0x34,0x6d,0x00,0xfe,0xff,0xff] + vmulnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vmulnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x59,0x71,0x7f] + vmulnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vmulnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x59,0x72,0x80] + vmulnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vrcppbf16 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4c,0xf7] + vrcppbf16 xmm22, xmm23 + +// CHECK: vrcppbf16 xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x0f,0x4c,0xf7] + vrcppbf16 xmm22 {k7}, xmm23 + +// CHECK: vrcppbf16 xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x8f,0x4c,0xf7] + vrcppbf16 xmm22 {k7} {z}, xmm23 + +// CHECK: vrcppbf16 zmm22, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4c,0xf7] + vrcppbf16 zmm22, zmm23 + +// CHECK: vrcppbf16 zmm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x4f,0x4c,0xf7] + vrcppbf16 zmm22 {k7}, zmm23 + +// CHECK: vrcppbf16 zmm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0xcf,0x4c,0xf7] + vrcppbf16 zmm22 {k7} {z}, zmm23 + +// CHECK: vrcppbf16 ymm22, ymm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4c,0xf7] + vrcppbf16 ymm22, ymm23 + +// CHECK: vrcppbf16 ymm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x2f,0x4c,0xf7] + vrcppbf16 ymm22 {k7}, ymm23 + +// CHECK: vrcppbf16 ymm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0xaf,0x4c,0xf7] + vrcppbf16 ymm22 {k7} {z}, ymm23 + +// CHECK: vrcppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrcppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x0f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vrcppbf16 xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x7c,0x18,0x4c,0x35,0x00,0x00,0x00,0x00] + 
vrcppbf16 xmm22, word ptr [rip]{1to8} + +// CHECK: vrcppbf16 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x7c,0x08,0x4c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vrcppbf16 xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vrcppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x7c,0x8f,0x4c,0x71,0x7f] + vrcppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vrcppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x7c,0x9f,0x4c,0x72,0x80] + vrcppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vrcppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrcppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x2f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vrcppbf16 ymm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x7c,0x38,0x4c,0x35,0x00,0x00,0x00,0x00] + vrcppbf16 ymm22, word ptr [rip]{1to16} + +// CHECK: vrcppbf16 ymm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x7c,0x28,0x4c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vrcppbf16 ymm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vrcppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x7c,0xaf,0x4c,0x71,0x7f] + vrcppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vrcppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x7c,0xbf,0x4c,0x72,0x80] + vrcppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vrcppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrcppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x4f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vrcppbf16 zmm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x7c,0x58,0x4c,0x35,0x00,0x00,0x00,0x00] + vrcppbf16 zmm22, word ptr [rip]{1to32} + +// CHECK: vrcppbf16 zmm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x7c,0x48,0x4c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vrcppbf16 zmm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vrcppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x7c,0xcf,0x4c,0x71,0x7f] + vrcppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vrcppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x7c,0xdf,0x4c,0x72,0x80] + vrcppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} + +// CHECK: vreducenepbf16 zmm22, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x56,0xf7,0x7b] + vreducenepbf16 zmm22, zmm23, 123 + +// CHECK: vreducenepbf16 zmm22 {k7}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x56,0xf7,0x7b] + vreducenepbf16 zmm22 {k7}, zmm23, 123 + +// CHECK: vreducenepbf16 zmm22 {k7} {z}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x56,0xf7,0x7b] + vreducenepbf16 zmm22 {k7} {z}, zmm23, 123 + +// CHECK: vreducenepbf16 ymm22, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x56,0xf7,0x7b] + vreducenepbf16 ymm22, ymm23, 123 + +// CHECK: vreducenepbf16 ymm22 {k7}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x56,0xf7,0x7b] + vreducenepbf16 ymm22 {k7}, ymm23, 123 + +// CHECK: vreducenepbf16 ymm22 {k7} 
{z}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x56,0xf7,0x7b] + vreducenepbf16 ymm22 {k7} {z}, ymm23, 123 + +// CHECK: vreducenepbf16 xmm22, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x56,0xf7,0x7b] + vreducenepbf16 xmm22, xmm23, 123 + +// CHECK: vreducenepbf16 xmm22 {k7}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x56,0xf7,0x7b] + vreducenepbf16 xmm22 {k7}, xmm23, 123 + +// CHECK: vreducenepbf16 xmm22 {k7} {z}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x56,0xf7,0x7b] + vreducenepbf16 xmm22 {k7} {z}, xmm23, 123 + +// CHECK: vreducenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vreducenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vreducenepbf16 xmm22, word ptr [rip]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 xmm22, word ptr [rip]{1to8}, 123 + +// CHECK: vreducenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x56,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vreducenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 + +// CHECK: vreducenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x56,0x71,0x7f,0x7b] + vreducenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 + +// CHECK: vreducenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x9f,0x56,0x72,0x80,0x7b] + vreducenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vreducenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vreducenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vreducenepbf16 ymm22, word ptr [rip]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 ymm22, word ptr [rip]{1to16}, 123 + +// CHECK: vreducenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x56,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vreducenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 + +// CHECK: vreducenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x56,0x71,0x7f,0x7b] + vreducenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 + +// CHECK: vreducenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x56,0x72,0x80,0x7b] + vreducenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vreducenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vreducenepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vreducenepbf16 zmm22, 
word ptr [rip]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 zmm22, word ptr [rip]{1to32}, 123 + +// CHECK: vreducenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x56,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vreducenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 + +// CHECK: vreducenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x56,0x71,0x7f,0x7b] + vreducenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 + +// CHECK: vreducenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x56,0x72,0x80,0x7b] + vreducenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 + +// CHECK: vrndscalenepbf16 zmm22, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x08,0xf7,0x7b] + vrndscalenepbf16 zmm22, zmm23, 123 + +// CHECK: vrndscalenepbf16 zmm22 {k7}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x08,0xf7,0x7b] + vrndscalenepbf16 zmm22 {k7}, zmm23, 123 + +// CHECK: vrndscalenepbf16 zmm22 {k7} {z}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x08,0xf7,0x7b] + vrndscalenepbf16 zmm22 {k7} {z}, zmm23, 123 + +// CHECK: vrndscalenepbf16 ymm22, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x08,0xf7,0x7b] + vrndscalenepbf16 ymm22, ymm23, 123 + +// CHECK: vrndscalenepbf16 ymm22 {k7}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x08,0xf7,0x7b] + vrndscalenepbf16 ymm22 {k7}, ymm23, 123 + +// CHECK: vrndscalenepbf16 ymm22 {k7} {z}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x08,0xf7,0x7b] + vrndscalenepbf16 ymm22 {k7} {z}, ymm23, 123 + +// CHECK: vrndscalenepbf16 xmm22, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x08,0xf7,0x7b] + vrndscalenepbf16 xmm22, xmm23, 123 + +// CHECK: vrndscalenepbf16 xmm22 {k7}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x08,0xf7,0x7b] + vrndscalenepbf16 xmm22 {k7}, xmm23, 123 + +// CHECK: vrndscalenepbf16 xmm22 {k7} {z}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x08,0xf7,0x7b] + vrndscalenepbf16 xmm22 {k7} {z}, xmm23, 123 + +// CHECK: vrndscalenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vrndscalenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vrndscalenepbf16 xmm22, word ptr [rip]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 xmm22, word ptr [rip]{1to8}, 123 + +// CHECK: vrndscalenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x08,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vrndscalenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 + +// CHECK: vrndscalenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 + +// CHECK: vrndscalenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x9f,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vrndscalenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: 
[0x62,0xa3,0x7f,0x28,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vrndscalenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vrndscalenepbf16 ymm22, word ptr [rip]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 ymm22, word ptr [rip]{1to16}, 123 + +// CHECK: vrndscalenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x08,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vrndscalenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 + +// CHECK: vrndscalenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 + +// CHECK: vrndscalenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vrndscalenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vrndscalenepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vrndscalenepbf16 zmm22, word ptr [rip]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 zmm22, word ptr [rip]{1to32}, 123 + +// CHECK: vrndscalenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x08,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vrndscalenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 + +// CHECK: vrndscalenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 + +// CHECK: vrndscalenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 + +// CHECK: vrsqrtpbf16 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4e,0xf7] + vrsqrtpbf16 xmm22, xmm23 + +// CHECK: vrsqrtpbf16 xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x0f,0x4e,0xf7] + vrsqrtpbf16 xmm22 {k7}, xmm23 + +// CHECK: vrsqrtpbf16 xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x8f,0x4e,0xf7] + vrsqrtpbf16 xmm22 {k7} {z}, xmm23 + +// CHECK: vrsqrtpbf16 zmm22, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4e,0xf7] + vrsqrtpbf16 zmm22, zmm23 + +// CHECK: vrsqrtpbf16 zmm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x4f,0x4e,0xf7] + vrsqrtpbf16 zmm22 {k7}, zmm23 + +// CHECK: vrsqrtpbf16 zmm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0xcf,0x4e,0xf7] + vrsqrtpbf16 zmm22 {k7} {z}, zmm23 + +// CHECK: vrsqrtpbf16 ymm22, ymm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4e,0xf7] + vrsqrtpbf16 ymm22, ymm23 + +// CHECK: vrsqrtpbf16 ymm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x2f,0x4e,0xf7] + vrsqrtpbf16 ymm22 {k7}, ymm23 + +// CHECK: vrsqrtpbf16 ymm22 {k7} {z}, ymm23 +// CHECK: encoding: 
[0x62,0xa6,0x7c,0xaf,0x4e,0xf7] + vrsqrtpbf16 ymm22 {k7} {z}, ymm23 + +// CHECK: vrsqrtpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrsqrtpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x0f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vrsqrtpbf16 xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x7c,0x18,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 xmm22, word ptr [rip]{1to8} + +// CHECK: vrsqrtpbf16 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x7c,0x08,0x4e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vrsqrtpbf16 xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vrsqrtpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x7c,0x8f,0x4e,0x71,0x7f] + vrsqrtpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vrsqrtpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x7c,0x9f,0x4e,0x72,0x80] + vrsqrtpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vrsqrtpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrsqrtpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x2f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vrsqrtpbf16 ymm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x7c,0x38,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 ymm22, word ptr [rip]{1to16} + +// CHECK: vrsqrtpbf16 ymm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x7c,0x28,0x4e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vrsqrtpbf16 ymm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vrsqrtpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x7c,0xaf,0x4e,0x71,0x7f] + vrsqrtpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vrsqrtpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x7c,0xbf,0x4e,0x72,0x80] + vrsqrtpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vrsqrtpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrsqrtpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x4f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vrsqrtpbf16 zmm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x7c,0x58,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 zmm22, word ptr [rip]{1to32} + +// CHECK: vrsqrtpbf16 zmm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x7c,0x48,0x4e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vrsqrtpbf16 zmm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vrsqrtpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x7c,0xcf,0x4e,0x71,0x7f] + vrsqrtpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vrsqrtpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x7c,0xdf,0x4e,0x72,0x80] + vrsqrtpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} + +// CHECK: vscalefpbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x2c,0xf0] + 
vscalefpbf16 ymm22, ymm23, ymm24 + +// CHECK: vscalefpbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x2c,0xf0] + vscalefpbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vscalefpbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x2c,0xf0] + vscalefpbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vscalefpbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x2c,0xf0] + vscalefpbf16 zmm22, zmm23, zmm24 + +// CHECK: vscalefpbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x2c,0xf0] + vscalefpbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vscalefpbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x2c,0xf0] + vscalefpbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vscalefpbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x2c,0xf0] + vscalefpbf16 xmm22, xmm23, xmm24 + +// CHECK: vscalefpbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x2c,0xf0] + vscalefpbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vscalefpbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x2c,0xf0] + vscalefpbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vscalefpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vscalefpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vscalefpbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vscalefpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x2c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vscalefpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vscalefpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x2c,0x71,0x7f] + vscalefpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vscalefpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x2c,0x72,0x80] + vscalefpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vscalefpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vscalefpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vscalefpbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vscalefpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x2c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vscalefpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vscalefpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x2c,0x71,0x7f] + vscalefpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vscalefpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: 
[0x62,0xe6,0x44,0xb7,0x2c,0x72,0x80] + vscalefpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vscalefpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vscalefpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vscalefpbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vscalefpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x2c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vscalefpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vscalefpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x2c,0x71,0x7f] + vscalefpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vscalefpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x2c,0x72,0x80] + vscalefpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vsqrtnepbf16 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x51,0xf7] + vsqrtnepbf16 xmm22, xmm23 + +// CHECK: vsqrtnepbf16 xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x0f,0x51,0xf7] + vsqrtnepbf16 xmm22 {k7}, xmm23 + +// CHECK: vsqrtnepbf16 xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x8f,0x51,0xf7] + vsqrtnepbf16 xmm22 {k7} {z}, xmm23 + +// CHECK: vsqrtnepbf16 zmm22, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x51,0xf7] + vsqrtnepbf16 zmm22, zmm23 + +// CHECK: vsqrtnepbf16 zmm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x4f,0x51,0xf7] + vsqrtnepbf16 zmm22 {k7}, zmm23 + +// CHECK: vsqrtnepbf16 zmm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0xcf,0x51,0xf7] + vsqrtnepbf16 zmm22 {k7} {z}, zmm23 + +// CHECK: vsqrtnepbf16 ymm22, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x51,0xf7] + vsqrtnepbf16 ymm22, ymm23 + +// CHECK: vsqrtnepbf16 ymm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x2f,0x51,0xf7] + vsqrtnepbf16 ymm22 {k7}, ymm23 + +// CHECK: vsqrtnepbf16 ymm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0xaf,0x51,0xf7] + vsqrtnepbf16 ymm22 {k7} {z}, ymm23 + +// CHECK: vsqrtnepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsqrtnepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x0f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vsqrtnepbf16 xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7d,0x18,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 xmm22, word ptr [rip]{1to8} + +// CHECK: vsqrtnepbf16 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + vsqrtnepbf16 xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vsqrtnepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x7d,0x8f,0x51,0x71,0x7f] + vsqrtnepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vsqrtnepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: 
[0x62,0xe5,0x7d,0x9f,0x51,0x72,0x80] + vsqrtnepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vsqrtnepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsqrtnepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x2f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vsqrtnepbf16 ymm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7d,0x38,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 ymm22, word ptr [rip]{1to16} + +// CHECK: vsqrtnepbf16 ymm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x7d,0x28,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + vsqrtnepbf16 ymm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vsqrtnepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x7d,0xaf,0x51,0x71,0x7f] + vsqrtnepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vsqrtnepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7d,0xbf,0x51,0x72,0x80] + vsqrtnepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vsqrtnepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsqrtnepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x4f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vsqrtnepbf16 zmm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7d,0x58,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 zmm22, word ptr [rip]{1to32} + +// CHECK: vsqrtnepbf16 zmm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x7d,0x48,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff] + vsqrtnepbf16 zmm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vsqrtnepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x7d,0xcf,0x51,0x71,0x7f] + vsqrtnepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vsqrtnepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7d,0xdf,0x51,0x72,0x80] + vsqrtnepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} + +// CHECK: vsubnepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5c,0xf0] + vsubnepbf16 ymm22, ymm23, ymm24 + +// CHECK: vsubnepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5c,0xf0] + vsubnepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vsubnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5c,0xf0] + vsubnepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vsubnepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5c,0xf0] + vsubnepbf16 zmm22, zmm23, zmm24 + +// CHECK: vsubnepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5c,0xf0] + vsubnepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vsubnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5c,0xf0] + vsubnepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vsubnepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5c,0xf0] + vsubnepbf16 xmm22, xmm23, xmm24 + +// CHECK: vsubnepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5c,0xf0] + vsubnepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vsubnepbf16 xmm22 {k7} {z}, xmm23, xmm24 
+// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5c,0xf0] + vsubnepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vsubnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsubnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vsubnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vsubnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vsubnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vsubnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5c,0x71,0x7f] + vsubnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vsubnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5c,0x72,0x80] + vsubnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vsubnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsubnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vsubnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vsubnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vsubnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vsubnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5c,0x71,0x7f] + vsubnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vsubnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5c,0x72,0x80] + vsubnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vsubnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsubnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vsubnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vsubnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vsubnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vsubnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5c,0x71,0x7f] + vsubnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: 
vsubnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5c,0x72,0x80] + vsubnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index b88abbb461d08..286fb4904870c 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -1176,6 +1176,8 @@ static const X86FoldTableEntry Table1[] = { {X86::VCOMISSZrr_Int, X86::VCOMISSZrm_Int, TB_NO_REVERSE}, {X86::VCOMISSrr, X86::VCOMISSrm, 0}, {X86::VCOMISSrr_Int, X86::VCOMISSrm_Int, TB_NO_REVERSE}, + {X86::VCOMSBF16Zrr, X86::VCOMSBF16Zrm, 0}, + {X86::VCOMSBF16Zrr_Int, X86::VCOMSBF16Zrm_Int, TB_NO_REVERSE}, {X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0}, {X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE}, {X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0}, @@ -1461,6 +1463,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VEXPANDPSZ128rr, X86::VEXPANDPSZ128rm, TB_NO_REVERSE}, {X86::VEXPANDPSZ256rr, X86::VEXPANDPSZ256rm, TB_NO_REVERSE}, {X86::VEXPANDPSZrr, X86::VEXPANDPSZrm, TB_NO_REVERSE}, + {X86::VFPCLASSPBF16Z128rr, X86::VFPCLASSPBF16Z128rm, 0}, + {X86::VFPCLASSPBF16Z256rr, X86::VFPCLASSPBF16Z256rm, 0}, + {X86::VFPCLASSPBF16Zrr, X86::VFPCLASSPBF16Zrm, 0}, {X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rm, 0}, {X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rm, 0}, {X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrm, 0}, @@ -1479,6 +1484,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VFRCZPSrr, X86::VFRCZPSrm, 0}, {X86::VFRCZSDrr, X86::VFRCZSDrm, TB_NO_REVERSE}, {X86::VFRCZSSrr, X86::VFRCZSSrm, TB_NO_REVERSE}, + {X86::VGETEXPPBF16Z128r, X86::VGETEXPPBF16Z128m, 0}, + {X86::VGETEXPPBF16Z256r, X86::VGETEXPPBF16Z256m, 0}, + {X86::VGETEXPPBF16Zr, X86::VGETEXPPBF16Zm, 0}, {X86::VGETEXPPDZ128r, X86::VGETEXPPDZ128m, 0}, {X86::VGETEXPPDZ256r, X86::VGETEXPPDZ256m, 0}, {X86::VGETEXPPDZr, X86::VGETEXPPDZm, 0}, @@ -1488,6 +1496,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VGETEXPPSZ128r, X86::VGETEXPPSZ128m, 0}, {X86::VGETEXPPSZ256r, X86::VGETEXPPSZ256m, 0}, {X86::VGETEXPPSZr, X86::VGETEXPPSZm, 0}, + {X86::VGETMANTPBF16Z128rri, X86::VGETMANTPBF16Z128rmi, 0}, + {X86::VGETMANTPBF16Z256rri, X86::VGETMANTPBF16Z256rmi, 0}, + {X86::VGETMANTPBF16Zrri, X86::VGETMANTPBF16Zrmi, 0}, {X86::VGETMANTPDZ128rri, X86::VGETMANTPDZ128rmi, 0}, {X86::VGETMANTPDZ256rri, X86::VGETMANTPDZ256rmi, 0}, {X86::VGETMANTPDZrri, X86::VGETMANTPDZrmi, 0}, @@ -1821,11 +1832,17 @@ static const X86FoldTableEntry Table1[] = { {X86::VRCP14PSZr, X86::VRCP14PSZm, 0}, {X86::VRCP28PDZr, X86::VRCP28PDZm, 0}, {X86::VRCP28PSZr, X86::VRCP28PSZm, 0}, + {X86::VRCPPBF16Z128r, X86::VRCPPBF16Z128m, 0}, + {X86::VRCPPBF16Z256r, X86::VRCPPBF16Z256m, 0}, + {X86::VRCPPBF16Zr, X86::VRCPPBF16Zm, 0}, {X86::VRCPPHZ128r, X86::VRCPPHZ128m, 0}, {X86::VRCPPHZ256r, X86::VRCPPHZ256m, 0}, {X86::VRCPPHZr, X86::VRCPPHZm, 0}, {X86::VRCPPSYr, X86::VRCPPSYm, 0}, {X86::VRCPPSr, X86::VRCPPSm, 0}, + {X86::VREDUCENEPBF16Z128rri, X86::VREDUCENEPBF16Z128rmi, 0}, + {X86::VREDUCENEPBF16Z256rri, X86::VREDUCENEPBF16Z256rmi, 0}, + {X86::VREDUCENEPBF16Zrri, X86::VREDUCENEPBF16Zrmi, 0}, {X86::VREDUCEPDZ128rri, X86::VREDUCEPDZ128rmi, 0}, {X86::VREDUCEPDZ256rri, X86::VREDUCEPDZ256rmi, 0}, {X86::VREDUCEPDZrri, X86::VREDUCEPDZrmi, 0}, @@ -1835,6 +1852,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VREDUCEPSZ128rri, X86::VREDUCEPSZ128rmi, 0}, {X86::VREDUCEPSZ256rri, X86::VREDUCEPSZ256rmi, 0}, {X86::VREDUCEPSZrri, X86::VREDUCEPSZrmi, 0}, + 
{X86::VRNDSCALENEPBF16Z128rri, X86::VRNDSCALENEPBF16Z128rmi, 0}, + {X86::VRNDSCALENEPBF16Z256rri, X86::VRNDSCALENEPBF16Z256rmi, 0}, + {X86::VRNDSCALENEPBF16Zrri, X86::VRNDSCALENEPBF16Zrmi, 0}, {X86::VRNDSCALEPDZ128rri, X86::VRNDSCALEPDZ128rmi, 0}, {X86::VRNDSCALEPDZ256rri, X86::VRNDSCALEPDZ256rmi, 0}, {X86::VRNDSCALEPDZrri, X86::VRNDSCALEPDZrmi, 0}, @@ -1856,11 +1876,17 @@ static const X86FoldTableEntry Table1[] = { {X86::VRSQRT14PSZr, X86::VRSQRT14PSZm, 0}, {X86::VRSQRT28PDZr, X86::VRSQRT28PDZm, 0}, {X86::VRSQRT28PSZr, X86::VRSQRT28PSZm, 0}, + {X86::VRSQRTPBF16Z128r, X86::VRSQRTPBF16Z128m, 0}, + {X86::VRSQRTPBF16Z256r, X86::VRSQRTPBF16Z256m, 0}, + {X86::VRSQRTPBF16Zr, X86::VRSQRTPBF16Zm, 0}, {X86::VRSQRTPHZ128r, X86::VRSQRTPHZ128m, 0}, {X86::VRSQRTPHZ256r, X86::VRSQRTPHZ256m, 0}, {X86::VRSQRTPHZr, X86::VRSQRTPHZm, 0}, {X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0}, {X86::VRSQRTPSr, X86::VRSQRTPSm, 0}, + {X86::VSQRTNEPBF16Z128r, X86::VSQRTNEPBF16Z128m, 0}, + {X86::VSQRTNEPBF16Z256r, X86::VSQRTNEPBF16Z256m, 0}, + {X86::VSQRTNEPBF16Zr, X86::VSQRTNEPBF16Zm, 0}, {X86::VSQRTPDYr, X86::VSQRTPDYm, 0}, {X86::VSQRTPDZ128r, X86::VSQRTPDZ128m, 0}, {X86::VSQRTPDZ256r, X86::VSQRTPDZ256m, 0}, @@ -2335,6 +2361,9 @@ static const X86FoldTableEntry Table2[] = { {X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16}, {X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16}, {X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16}, + {X86::VADDNEPBF16Z128rr, X86::VADDNEPBF16Z128rm, 0}, + {X86::VADDNEPBF16Z256rr, X86::VADDNEPBF16Z256rm, 0}, + {X86::VADDNEPBF16Zrr, X86::VADDNEPBF16Zrm, 0}, {X86::VADDPDYrr, X86::VADDPDYrm, 0}, {X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0}, {X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0}, @@ -2432,6 +2461,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VBROADCASTSSZ128rrkz, X86::VBROADCASTSSZ128rmkz, TB_NO_REVERSE}, {X86::VBROADCASTSSZ256rrkz, X86::VBROADCASTSSZ256rmkz, TB_NO_REVERSE}, {X86::VBROADCASTSSZrrkz, X86::VBROADCASTSSZrmkz, TB_NO_REVERSE}, + {X86::VCMPPBF16Z128rri, X86::VCMPPBF16Z128rmi, 0}, + {X86::VCMPPBF16Z256rri, X86::VCMPPBF16Z256rmi, 0}, + {X86::VCMPPBF16Zrri, X86::VCMPPBF16Zrmi, 0}, {X86::VCMPPDYrri, X86::VCMPPDYrmi, 0}, {X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0}, {X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0}, @@ -2737,6 +2769,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VDBPSADBWZ128rri, X86::VDBPSADBWZ128rmi, 0}, {X86::VDBPSADBWZ256rri, X86::VDBPSADBWZ256rmi, 0}, {X86::VDBPSADBWZrri, X86::VDBPSADBWZrmi, 0}, + {X86::VDIVNEPBF16Z128rr, X86::VDIVNEPBF16Z128rm, 0}, + {X86::VDIVNEPBF16Z256rr, X86::VDIVNEPBF16Z256rm, 0}, + {X86::VDIVNEPBF16Zrr, X86::VDIVNEPBF16Zrm, 0}, {X86::VDIVPDYrr, X86::VDIVPDYrm, 0}, {X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0}, {X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0}, @@ -2819,6 +2854,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE}, {X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0}, {X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE}, + {X86::VFPCLASSPBF16Z128rrk, X86::VFPCLASSPBF16Z128rmk, 0}, + {X86::VFPCLASSPBF16Z256rrk, X86::VFPCLASSPBF16Z256rmk, 0}, + {X86::VFPCLASSPBF16Zrrk, X86::VFPCLASSPBF16Zrmk, 0}, {X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmk, 0}, {X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmk, 0}, {X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmk, 0}, @@ -2831,6 +2869,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VFPCLASSSDZrrk, X86::VFPCLASSSDZrmk, TB_NO_REVERSE}, {X86::VFPCLASSSHZrrk, X86::VFPCLASSSHZrmk, TB_NO_REVERSE}, {X86::VFPCLASSSSZrrk, X86::VFPCLASSSSZrmk, TB_NO_REVERSE}, + 
{X86::VGETEXPPBF16Z128rkz, X86::VGETEXPPBF16Z128mkz, 0}, + {X86::VGETEXPPBF16Z256rkz, X86::VGETEXPPBF16Z256mkz, 0}, + {X86::VGETEXPPBF16Zrkz, X86::VGETEXPPBF16Zmkz, 0}, {X86::VGETEXPPDZ128rkz, X86::VGETEXPPDZ128mkz, 0}, {X86::VGETEXPPDZ256rkz, X86::VGETEXPPDZ256mkz, 0}, {X86::VGETEXPPDZrkz, X86::VGETEXPPDZmkz, 0}, @@ -2843,6 +2884,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VGETEXPSDZr, X86::VGETEXPSDZm, TB_NO_REVERSE}, {X86::VGETEXPSHZr, X86::VGETEXPSHZm, TB_NO_REVERSE}, {X86::VGETEXPSSZr, X86::VGETEXPSSZm, TB_NO_REVERSE}, + {X86::VGETMANTPBF16Z128rrikz, X86::VGETMANTPBF16Z128rmikz, 0}, + {X86::VGETMANTPBF16Z256rrikz, X86::VGETMANTPBF16Z256rmikz, 0}, + {X86::VGETMANTPBF16Zrrikz, X86::VGETMANTPBF16Zrmikz, 0}, {X86::VGETMANTPDZ128rrikz, X86::VGETMANTPDZ128rmikz, 0}, {X86::VGETMANTPDZ256rrikz, X86::VGETMANTPDZ256rmikz, 0}, {X86::VGETMANTPDZrrikz, X86::VGETMANTPDZrmikz, 0}, @@ -2910,6 +2954,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VMAXCSHZrr, X86::VMAXCSHZrm, 0}, {X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0}, {X86::VMAXCSSrr, X86::VMAXCSSrm, 0}, + {X86::VMAXPBF16Z128rr, X86::VMAXPBF16Z128rm, 0}, + {X86::VMAXPBF16Z256rr, X86::VMAXPBF16Z256rm, 0}, + {X86::VMAXPBF16Zrr, X86::VMAXPBF16Zrm, 0}, {X86::VMAXPDYrr, X86::VMAXPDYrm, 0}, {X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0}, {X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0}, @@ -2966,6 +3013,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VMINMAXSDrri, X86::VMINMAXSDrmi, TB_NO_REVERSE}, {X86::VMINMAXSHrri, X86::VMINMAXSHrmi, TB_NO_REVERSE}, {X86::VMINMAXSSrri, X86::VMINMAXSSrmi, TB_NO_REVERSE}, + {X86::VMINPBF16Z128rr, X86::VMINPBF16Z128rm, 0}, + {X86::VMINPBF16Z256rr, X86::VMINPBF16Z256rm, 0}, + {X86::VMINPBF16Zrr, X86::VMINPBF16Zrm, 0}, {X86::VMINPDYrr, X86::VMINPDYrm, 0}, {X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0}, {X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0}, @@ -3037,6 +3087,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VMPSADBWZ256rri, X86::VMPSADBWZ256rmi, 0}, {X86::VMPSADBWZrri, X86::VMPSADBWZrmi, 0}, {X86::VMPSADBWrri, X86::VMPSADBWrmi, 0}, + {X86::VMULNEPBF16Z128rr, X86::VMULNEPBF16Z128rm, 0}, + {X86::VMULNEPBF16Z256rr, X86::VMULNEPBF16Z256rm, 0}, + {X86::VMULNEPBF16Zrr, X86::VMULNEPBF16Zrm, 0}, {X86::VMULPDYrr, X86::VMULPDYrm, 0}, {X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0}, {X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0}, @@ -3887,12 +3940,18 @@ static const X86FoldTableEntry Table2[] = { {X86::VRCP28PSZrkz, X86::VRCP28PSZmkz, 0}, {X86::VRCP28SDZr, X86::VRCP28SDZm, TB_NO_REVERSE}, {X86::VRCP28SSZr, X86::VRCP28SSZm, TB_NO_REVERSE}, + {X86::VRCPPBF16Z128rkz, X86::VRCPPBF16Z128mkz, 0}, + {X86::VRCPPBF16Z256rkz, X86::VRCPPBF16Z256mkz, 0}, + {X86::VRCPPBF16Zrkz, X86::VRCPPBF16Zmkz, 0}, {X86::VRCPPHZ128rkz, X86::VRCPPHZ128mkz, 0}, {X86::VRCPPHZ256rkz, X86::VRCPPHZ256mkz, 0}, {X86::VRCPPHZrkz, X86::VRCPPHZmkz, 0}, {X86::VRCPSHZrr, X86::VRCPSHZrm, TB_NO_REVERSE}, {X86::VRCPSSr, X86::VRCPSSm, 0}, {X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE}, + {X86::VREDUCENEPBF16Z128rrikz, X86::VREDUCENEPBF16Z128rmikz, 0}, + {X86::VREDUCENEPBF16Z256rrikz, X86::VREDUCENEPBF16Z256rmikz, 0}, + {X86::VREDUCENEPBF16Zrrikz, X86::VREDUCENEPBF16Zrmikz, 0}, {X86::VREDUCEPDZ128rrikz, X86::VREDUCEPDZ128rmikz, 0}, {X86::VREDUCEPDZ256rrikz, X86::VREDUCEPDZ256rmikz, 0}, {X86::VREDUCEPDZrrikz, X86::VREDUCEPDZrmikz, 0}, @@ -3905,6 +3964,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VREDUCESDZrri, X86::VREDUCESDZrmi, TB_NO_REVERSE}, {X86::VREDUCESHZrri, X86::VREDUCESHZrmi, TB_NO_REVERSE}, {X86::VREDUCESSZrri, X86::VREDUCESSZrmi, TB_NO_REVERSE}, + 
{X86::VRNDSCALENEPBF16Z128rrikz, X86::VRNDSCALENEPBF16Z128rmikz, 0}, + {X86::VRNDSCALENEPBF16Z256rrikz, X86::VRNDSCALENEPBF16Z256rmikz, 0}, + {X86::VRNDSCALENEPBF16Zrrikz, X86::VRNDSCALENEPBF16Zrmikz, 0}, {X86::VRNDSCALEPDZ128rrikz, X86::VRNDSCALEPDZ128rmikz, 0}, {X86::VRNDSCALEPDZ256rrikz, X86::VRNDSCALEPDZ256rmikz, 0}, {X86::VRNDSCALEPDZrrikz, X86::VRNDSCALEPDZrmikz, 0}, @@ -3936,12 +3998,18 @@ static const X86FoldTableEntry Table2[] = { {X86::VRSQRT28PSZrkz, X86::VRSQRT28PSZmkz, 0}, {X86::VRSQRT28SDZr, X86::VRSQRT28SDZm, TB_NO_REVERSE}, {X86::VRSQRT28SSZr, X86::VRSQRT28SSZm, TB_NO_REVERSE}, + {X86::VRSQRTPBF16Z128rkz, X86::VRSQRTPBF16Z128mkz, 0}, + {X86::VRSQRTPBF16Z256rkz, X86::VRSQRTPBF16Z256mkz, 0}, + {X86::VRSQRTPBF16Zrkz, X86::VRSQRTPBF16Zmkz, 0}, {X86::VRSQRTPHZ128rkz, X86::VRSQRTPHZ128mkz, 0}, {X86::VRSQRTPHZ256rkz, X86::VRSQRTPHZ256mkz, 0}, {X86::VRSQRTPHZrkz, X86::VRSQRTPHZmkz, 0}, {X86::VRSQRTSHZrr, X86::VRSQRTSHZrm, TB_NO_REVERSE}, {X86::VRSQRTSSr, X86::VRSQRTSSm, 0}, {X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE}, + {X86::VSCALEFPBF16Z128rr, X86::VSCALEFPBF16Z128rm, 0}, + {X86::VSCALEFPBF16Z256rr, X86::VSCALEFPBF16Z256rm, 0}, + {X86::VSCALEFPBF16Zrr, X86::VSCALEFPBF16Zrm, 0}, {X86::VSCALEFPDZ128rr, X86::VSCALEFPDZ128rm, 0}, {X86::VSCALEFPDZ256rr, X86::VSCALEFPDZ256rm, 0}, {X86::VSCALEFPDZrr, X86::VSCALEFPDZrm, 0}, @@ -3976,6 +4044,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VSM4KEY4rr, X86::VSM4KEY4rm, 0}, {X86::VSM4RNDS4Yrr, X86::VSM4RNDS4Yrm, 0}, {X86::VSM4RNDS4rr, X86::VSM4RNDS4rm, 0}, + {X86::VSQRTNEPBF16Z128rkz, X86::VSQRTNEPBF16Z128mkz, 0}, + {X86::VSQRTNEPBF16Z256rkz, X86::VSQRTNEPBF16Z256mkz, 0}, + {X86::VSQRTNEPBF16Zrkz, X86::VSQRTNEPBF16Zmkz, 0}, {X86::VSQRTPDZ128rkz, X86::VSQRTPDZ128mkz, 0}, {X86::VSQRTPDZ256rkz, X86::VSQRTPDZ256mkz, 0}, {X86::VSQRTPDZrkz, X86::VSQRTPDZmkz, 0}, @@ -3995,6 +4066,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VSQRTSSZr_Int, X86::VSQRTSSZm_Int, TB_NO_REVERSE}, {X86::VSQRTSSr, X86::VSQRTSSm, 0}, {X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE}, + {X86::VSUBNEPBF16Z128rr, X86::VSUBNEPBF16Z128rm, 0}, + {X86::VSUBNEPBF16Z256rr, X86::VSUBNEPBF16Z256rm, 0}, + {X86::VSUBNEPBF16Zrr, X86::VSUBNEPBF16Zrm, 0}, {X86::VSUBPDYrr, X86::VSUBPDYrm, 0}, {X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0}, {X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0}, @@ -4069,6 +4143,9 @@ static const X86FoldTableEntry Table2[] = { }; static const X86FoldTableEntry Table3[] = { + {X86::VADDNEPBF16Z128rrkz, X86::VADDNEPBF16Z128rmkz, 0}, + {X86::VADDNEPBF16Z256rrkz, X86::VADDNEPBF16Z256rmkz, 0}, + {X86::VADDNEPBF16Zrrkz, X86::VADDNEPBF16Zrmkz, 0}, {X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0}, {X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0}, {X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0}, @@ -4115,6 +4192,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VBROADCASTSSZ128rrk, X86::VBROADCASTSSZ128rmk, TB_NO_REVERSE}, {X86::VBROADCASTSSZ256rrk, X86::VBROADCASTSSZ256rmk, TB_NO_REVERSE}, {X86::VBROADCASTSSZrrk, X86::VBROADCASTSSZrmk, TB_NO_REVERSE}, + {X86::VCMPPBF16Z128rrik, X86::VCMPPBF16Z128rmik, 0}, + {X86::VCMPPBF16Z256rrik, X86::VCMPPBF16Z256rmik, 0}, + {X86::VCMPPBF16Zrrik, X86::VCMPPBF16Zrmik, 0}, {X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0}, {X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0}, {X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0}, @@ -4367,6 +4447,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VDBPSADBWZ128rrikz, X86::VDBPSADBWZ128rmikz, 0}, {X86::VDBPSADBWZ256rrikz, X86::VDBPSADBWZ256rmikz, 0}, {X86::VDBPSADBWZrrikz, X86::VDBPSADBWZrmikz, 0}, 
+ {X86::VDIVNEPBF16Z128rrkz, X86::VDIVNEPBF16Z128rmkz, 0}, + {X86::VDIVNEPBF16Z256rrkz, X86::VDIVNEPBF16Z256rmkz, 0}, + {X86::VDIVNEPBF16Zrrkz, X86::VDIVNEPBF16Zrmkz, 0}, {X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0}, {X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0}, {X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0}, @@ -4409,6 +4492,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFIXUPIMMPSZrri, X86::VFIXUPIMMPSZrmi, 0}, {X86::VFIXUPIMMSDZrri, X86::VFIXUPIMMSDZrmi, TB_NO_REVERSE}, {X86::VFIXUPIMMSSZrri, X86::VFIXUPIMMSSZrmi, TB_NO_REVERSE}, + {X86::VFMADD132NEPBF16Z128r, X86::VFMADD132NEPBF16Z128m, 0}, + {X86::VFMADD132NEPBF16Z256r, X86::VFMADD132NEPBF16Z256m, 0}, + {X86::VFMADD132NEPBF16Zr, X86::VFMADD132NEPBF16Zm, 0}, {X86::VFMADD132PDYr, X86::VFMADD132PDYm, 0}, {X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128m, 0}, {X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256m, 0}, @@ -4432,6 +4518,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMADD132SSZr_Int, X86::VFMADD132SSZm_Int, TB_NO_REVERSE}, {X86::VFMADD132SSr, X86::VFMADD132SSm, 0}, {X86::VFMADD132SSr_Int, X86::VFMADD132SSm_Int, TB_NO_REVERSE}, + {X86::VFMADD213NEPBF16Z128r, X86::VFMADD213NEPBF16Z128m, 0}, + {X86::VFMADD213NEPBF16Z256r, X86::VFMADD213NEPBF16Z256m, 0}, + {X86::VFMADD213NEPBF16Zr, X86::VFMADD213NEPBF16Zm, 0}, {X86::VFMADD213PDYr, X86::VFMADD213PDYm, 0}, {X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128m, 0}, {X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256m, 0}, @@ -4455,6 +4544,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMADD213SSZr_Int, X86::VFMADD213SSZm_Int, TB_NO_REVERSE}, {X86::VFMADD213SSr, X86::VFMADD213SSm, 0}, {X86::VFMADD213SSr_Int, X86::VFMADD213SSm_Int, TB_NO_REVERSE}, + {X86::VFMADD231NEPBF16Z128r, X86::VFMADD231NEPBF16Z128m, 0}, + {X86::VFMADD231NEPBF16Z256r, X86::VFMADD231NEPBF16Z256m, 0}, + {X86::VFMADD231NEPBF16Zr, X86::VFMADD231NEPBF16Zm, 0}, {X86::VFMADD231PDYr, X86::VFMADD231PDYm, 0}, {X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128m, 0}, {X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256m, 0}, @@ -4533,6 +4625,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, 0}, {X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, 0}, {X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, 0}, + {X86::VFMSUB132NEPBF16Z128r, X86::VFMSUB132NEPBF16Z128m, 0}, + {X86::VFMSUB132NEPBF16Z256r, X86::VFMSUB132NEPBF16Z256m, 0}, + {X86::VFMSUB132NEPBF16Zr, X86::VFMSUB132NEPBF16Zm, 0}, {X86::VFMSUB132PDYr, X86::VFMSUB132PDYm, 0}, {X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128m, 0}, {X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256m, 0}, @@ -4556,6 +4651,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMSUB132SSZr_Int, X86::VFMSUB132SSZm_Int, TB_NO_REVERSE}, {X86::VFMSUB132SSr, X86::VFMSUB132SSm, 0}, {X86::VFMSUB132SSr_Int, X86::VFMSUB132SSm_Int, TB_NO_REVERSE}, + {X86::VFMSUB213NEPBF16Z128r, X86::VFMSUB213NEPBF16Z128m, 0}, + {X86::VFMSUB213NEPBF16Z256r, X86::VFMSUB213NEPBF16Z256m, 0}, + {X86::VFMSUB213NEPBF16Zr, X86::VFMSUB213NEPBF16Zm, 0}, {X86::VFMSUB213PDYr, X86::VFMSUB213PDYm, 0}, {X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128m, 0}, {X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256m, 0}, @@ -4579,6 +4677,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMSUB213SSZr_Int, X86::VFMSUB213SSZm_Int, TB_NO_REVERSE}, {X86::VFMSUB213SSr, X86::VFMSUB213SSm, 0}, {X86::VFMSUB213SSr_Int, X86::VFMSUB213SSm_Int, TB_NO_REVERSE}, + {X86::VFMSUB231NEPBF16Z128r, X86::VFMSUB231NEPBF16Z128m, 0}, + {X86::VFMSUB231NEPBF16Z256r, X86::VFMSUB231NEPBF16Z256m, 0}, + {X86::VFMSUB231NEPBF16Zr, X86::VFMSUB231NEPBF16Zm, 0}, {X86::VFMSUB231PDYr, 
X86::VFMSUB231PDYm, 0}, {X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128m, 0}, {X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256m, 0}, @@ -4657,6 +4758,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMULCPHZ256rrkz, X86::VFMULCPHZ256rmkz, 0}, {X86::VFMULCPHZrrkz, X86::VFMULCPHZrmkz, 0}, {X86::VFMULCSHZrrkz, X86::VFMULCSHZrmkz, TB_NO_REVERSE}, + {X86::VFNMADD132NEPBF16Z128r, X86::VFNMADD132NEPBF16Z128m, 0}, + {X86::VFNMADD132NEPBF16Z256r, X86::VFNMADD132NEPBF16Z256m, 0}, + {X86::VFNMADD132NEPBF16Zr, X86::VFNMADD132NEPBF16Zm, 0}, {X86::VFNMADD132PDYr, X86::VFNMADD132PDYm, 0}, {X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128m, 0}, {X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, 0}, @@ -4680,6 +4784,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMADD132SSZr_Int, X86::VFNMADD132SSZm_Int, TB_NO_REVERSE}, {X86::VFNMADD132SSr, X86::VFNMADD132SSm, 0}, {X86::VFNMADD132SSr_Int, X86::VFNMADD132SSm_Int, TB_NO_REVERSE}, + {X86::VFNMADD213NEPBF16Z128r, X86::VFNMADD213NEPBF16Z128m, 0}, + {X86::VFNMADD213NEPBF16Z256r, X86::VFNMADD213NEPBF16Z256m, 0}, + {X86::VFNMADD213NEPBF16Zr, X86::VFNMADD213NEPBF16Zm, 0}, {X86::VFNMADD213PDYr, X86::VFNMADD213PDYm, 0}, {X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128m, 0}, {X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256m, 0}, @@ -4703,6 +4810,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMADD213SSZr_Int, X86::VFNMADD213SSZm_Int, TB_NO_REVERSE}, {X86::VFNMADD213SSr, X86::VFNMADD213SSm, 0}, {X86::VFNMADD213SSr_Int, X86::VFNMADD213SSm_Int, TB_NO_REVERSE}, + {X86::VFNMADD231NEPBF16Z128r, X86::VFNMADD231NEPBF16Z128m, 0}, + {X86::VFNMADD231NEPBF16Z256r, X86::VFNMADD231NEPBF16Z256m, 0}, + {X86::VFNMADD231NEPBF16Zr, X86::VFNMADD231NEPBF16Zm, 0}, {X86::VFNMADD231PDYr, X86::VFNMADD231PDYm, 0}, {X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128m, 0}, {X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256m, 0}, @@ -4734,6 +4844,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE}, {X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0}, {X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE}, + {X86::VFNMSUB132NEPBF16Z128r, X86::VFNMSUB132NEPBF16Z128m, 0}, + {X86::VFNMSUB132NEPBF16Z256r, X86::VFNMSUB132NEPBF16Z256m, 0}, + {X86::VFNMSUB132NEPBF16Zr, X86::VFNMSUB132NEPBF16Zm, 0}, {X86::VFNMSUB132PDYr, X86::VFNMSUB132PDYm, 0}, {X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128m, 0}, {X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256m, 0}, @@ -4757,6 +4870,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMSUB132SSZr_Int, X86::VFNMSUB132SSZm_Int, TB_NO_REVERSE}, {X86::VFNMSUB132SSr, X86::VFNMSUB132SSm, 0}, {X86::VFNMSUB132SSr_Int, X86::VFNMSUB132SSm_Int, TB_NO_REVERSE}, + {X86::VFNMSUB213NEPBF16Z128r, X86::VFNMSUB213NEPBF16Z128m, 0}, + {X86::VFNMSUB213NEPBF16Z256r, X86::VFNMSUB213NEPBF16Z256m, 0}, + {X86::VFNMSUB213NEPBF16Zr, X86::VFNMSUB213NEPBF16Zm, 0}, {X86::VFNMSUB213PDYr, X86::VFNMSUB213PDYm, 0}, {X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128m, 0}, {X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256m, 0}, @@ -4780,6 +4896,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMSUB213SSZr_Int, X86::VFNMSUB213SSZm_Int, TB_NO_REVERSE}, {X86::VFNMSUB213SSr, X86::VFNMSUB213SSm, 0}, {X86::VFNMSUB213SSr_Int, X86::VFNMSUB213SSm_Int, TB_NO_REVERSE}, + {X86::VFNMSUB231NEPBF16Z128r, X86::VFNMSUB231NEPBF16Z128m, 0}, + {X86::VFNMSUB231NEPBF16Z256r, X86::VFNMSUB231NEPBF16Z256m, 0}, + {X86::VFNMSUB231NEPBF16Zr, X86::VFNMSUB231NEPBF16Zm, 0}, {X86::VFNMSUB231PDYr, X86::VFNMSUB231PDYm, 0}, {X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128m, 0}, 
{X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256m, 0}, @@ -4811,6 +4930,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE}, {X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0}, {X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE}, + {X86::VGETEXPPBF16Z128rk, X86::VGETEXPPBF16Z128mk, 0}, + {X86::VGETEXPPBF16Z256rk, X86::VGETEXPPBF16Z256mk, 0}, + {X86::VGETEXPPBF16Zrk, X86::VGETEXPPBF16Zmk, 0}, {X86::VGETEXPPDZ128rk, X86::VGETEXPPDZ128mk, 0}, {X86::VGETEXPPDZ256rk, X86::VGETEXPPDZ256mk, 0}, {X86::VGETEXPPDZrk, X86::VGETEXPPDZmk, 0}, @@ -4823,6 +4945,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VGETEXPSDZrkz, X86::VGETEXPSDZmkz, TB_NO_REVERSE}, {X86::VGETEXPSHZrkz, X86::VGETEXPSHZmkz, TB_NO_REVERSE}, {X86::VGETEXPSSZrkz, X86::VGETEXPSSZmkz, TB_NO_REVERSE}, + {X86::VGETMANTPBF16Z128rrik, X86::VGETMANTPBF16Z128rmik, 0}, + {X86::VGETMANTPBF16Z256rrik, X86::VGETMANTPBF16Z256rmik, 0}, + {X86::VGETMANTPBF16Zrrik, X86::VGETMANTPBF16Zrmik, 0}, {X86::VGETMANTPDZ128rrik, X86::VGETMANTPDZ128rmik, 0}, {X86::VGETMANTPDZ256rrik, X86::VGETMANTPDZ256rmik, 0}, {X86::VGETMANTPDZrrik, X86::VGETMANTPDZrmik, 0}, @@ -4865,6 +4990,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0}, {X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0}, {X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0}, + {X86::VMAXPBF16Z128rrkz, X86::VMAXPBF16Z128rmkz, 0}, + {X86::VMAXPBF16Z256rrkz, X86::VMAXPBF16Z256rmkz, 0}, + {X86::VMAXPBF16Zrrkz, X86::VMAXPBF16Zrmkz, 0}, {X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0}, {X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0}, {X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0}, @@ -4901,6 +5029,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VMINMAXSDrrikz, X86::VMINMAXSDrmikz, TB_NO_REVERSE}, {X86::VMINMAXSHrrikz, X86::VMINMAXSHrmikz, TB_NO_REVERSE}, {X86::VMINMAXSSrrikz, X86::VMINMAXSSrmikz, TB_NO_REVERSE}, + {X86::VMINPBF16Z128rrkz, X86::VMINPBF16Z128rmkz, 0}, + {X86::VMINPBF16Z256rrkz, X86::VMINPBF16Z256rmkz, 0}, + {X86::VMINPBF16Zrrkz, X86::VMINPBF16Zrmkz, 0}, {X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0}, {X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0}, {X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0}, @@ -4955,6 +5086,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VMPSADBWZ128rrikz, X86::VMPSADBWZ128rmikz, 0}, {X86::VMPSADBWZ256rrikz, X86::VMPSADBWZ256rmikz, 0}, {X86::VMPSADBWZrrikz, X86::VMPSADBWZrmikz, 0}, + {X86::VMULNEPBF16Z128rrkz, X86::VMULNEPBF16Z128rmkz, 0}, + {X86::VMULNEPBF16Z256rrkz, X86::VMULNEPBF16Z256rmkz, 0}, + {X86::VMULNEPBF16Zrrkz, X86::VMULNEPBF16Zrmkz, 0}, {X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0}, {X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0}, {X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0}, @@ -5696,10 +5830,16 @@ static const X86FoldTableEntry Table3[] = { {X86::VRCP28PSZrk, X86::VRCP28PSZmk, 0}, {X86::VRCP28SDZrkz, X86::VRCP28SDZmkz, TB_NO_REVERSE}, {X86::VRCP28SSZrkz, X86::VRCP28SSZmkz, TB_NO_REVERSE}, + {X86::VRCPPBF16Z128rk, X86::VRCPPBF16Z128mk, 0}, + {X86::VRCPPBF16Z256rk, X86::VRCPPBF16Z256mk, 0}, + {X86::VRCPPBF16Zrk, X86::VRCPPBF16Zmk, 0}, {X86::VRCPPHZ128rk, X86::VRCPPHZ128mk, 0}, {X86::VRCPPHZ256rk, X86::VRCPPHZ256mk, 0}, {X86::VRCPPHZrk, X86::VRCPPHZmk, 0}, {X86::VRCPSHZrrkz, X86::VRCPSHZrmkz, TB_NO_REVERSE}, + {X86::VREDUCENEPBF16Z128rrik, X86::VREDUCENEPBF16Z128rmik, 0}, + {X86::VREDUCENEPBF16Z256rrik, X86::VREDUCENEPBF16Z256rmik, 0}, + {X86::VREDUCENEPBF16Zrrik, X86::VREDUCENEPBF16Zrmik, 0}, {X86::VREDUCEPDZ128rrik, X86::VREDUCEPDZ128rmik, 0}, {X86::VREDUCEPDZ256rrik, 
X86::VREDUCEPDZ256rmik, 0}, {X86::VREDUCEPDZrrik, X86::VREDUCEPDZrmik, 0}, @@ -5712,6 +5852,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VREDUCESDZrrikz, X86::VREDUCESDZrmikz, TB_NO_REVERSE}, {X86::VREDUCESHZrrikz, X86::VREDUCESHZrmikz, TB_NO_REVERSE}, {X86::VREDUCESSZrrikz, X86::VREDUCESSZrmikz, TB_NO_REVERSE}, + {X86::VRNDSCALENEPBF16Z128rrik, X86::VRNDSCALENEPBF16Z128rmik, 0}, + {X86::VRNDSCALENEPBF16Z256rrik, X86::VRNDSCALENEPBF16Z256rmik, 0}, + {X86::VRNDSCALENEPBF16Zrrik, X86::VRNDSCALENEPBF16Zrmik, 0}, {X86::VRNDSCALEPDZ128rrik, X86::VRNDSCALEPDZ128rmik, 0}, {X86::VRNDSCALEPDZ256rrik, X86::VRNDSCALEPDZ256rmik, 0}, {X86::VRNDSCALEPDZrrik, X86::VRNDSCALEPDZrmik, 0}, @@ -5736,10 +5879,16 @@ static const X86FoldTableEntry Table3[] = { {X86::VRSQRT28PSZrk, X86::VRSQRT28PSZmk, 0}, {X86::VRSQRT28SDZrkz, X86::VRSQRT28SDZmkz, TB_NO_REVERSE}, {X86::VRSQRT28SSZrkz, X86::VRSQRT28SSZmkz, TB_NO_REVERSE}, + {X86::VRSQRTPBF16Z128rk, X86::VRSQRTPBF16Z128mk, 0}, + {X86::VRSQRTPBF16Z256rk, X86::VRSQRTPBF16Z256mk, 0}, + {X86::VRSQRTPBF16Zrk, X86::VRSQRTPBF16Zmk, 0}, {X86::VRSQRTPHZ128rk, X86::VRSQRTPHZ128mk, 0}, {X86::VRSQRTPHZ256rk, X86::VRSQRTPHZ256mk, 0}, {X86::VRSQRTPHZrk, X86::VRSQRTPHZmk, 0}, {X86::VRSQRTSHZrrkz, X86::VRSQRTSHZrmkz, TB_NO_REVERSE}, + {X86::VSCALEFPBF16Z128rrkz, X86::VSCALEFPBF16Z128rmkz, 0}, + {X86::VSCALEFPBF16Z256rrkz, X86::VSCALEFPBF16Z256rmkz, 0}, + {X86::VSCALEFPBF16Zrrkz, X86::VSCALEFPBF16Zrmkz, 0}, {X86::VSCALEFPDZ128rrkz, X86::VSCALEFPDZ128rmkz, 0}, {X86::VSCALEFPDZ256rrkz, X86::VSCALEFPDZ256rmkz, 0}, {X86::VSCALEFPDZrrkz, X86::VSCALEFPDZrmkz, 0}, @@ -5769,6 +5918,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VSM3MSG1rr, X86::VSM3MSG1rm, 0}, {X86::VSM3MSG2rr, X86::VSM3MSG2rm, 0}, {X86::VSM3RNDS2rr, X86::VSM3RNDS2rm, 0}, + {X86::VSQRTNEPBF16Z128rk, X86::VSQRTNEPBF16Z128mk, 0}, + {X86::VSQRTNEPBF16Z256rk, X86::VSQRTNEPBF16Z256mk, 0}, + {X86::VSQRTNEPBF16Zrk, X86::VSQRTNEPBF16Zmk, 0}, {X86::VSQRTPDZ128rk, X86::VSQRTPDZ128mk, 0}, {X86::VSQRTPDZ256rk, X86::VSQRTPDZ256mk, 0}, {X86::VSQRTPDZrk, X86::VSQRTPDZmk, 0}, @@ -5781,6 +5933,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE}, {X86::VSQRTSHZr_Intkz, X86::VSQRTSHZm_Intkz, TB_NO_REVERSE}, {X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE}, + {X86::VSUBNEPBF16Z128rrkz, X86::VSUBNEPBF16Z128rmkz, 0}, + {X86::VSUBNEPBF16Z256rrkz, X86::VSUBNEPBF16Z256rmkz, 0}, + {X86::VSUBNEPBF16Zrrkz, X86::VSUBNEPBF16Zrmkz, 0}, {X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0}, {X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0}, {X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0}, @@ -5814,6 +5969,9 @@ static const X86FoldTableEntry Table3[] = { }; static const X86FoldTableEntry Table4[] = { + {X86::VADDNEPBF16Z128rrk, X86::VADDNEPBF16Z128rmk, 0}, + {X86::VADDNEPBF16Z256rrk, X86::VADDNEPBF16Z256rmk, 0}, + {X86::VADDNEPBF16Zrrk, X86::VADDNEPBF16Zrmk, 0}, {X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0}, {X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0}, {X86::VADDPDZrrk, X86::VADDPDZrmk, 0}, @@ -5883,6 +6041,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0}, {X86::VDBPSADBWZ256rrik, X86::VDBPSADBWZ256rmik, 0}, {X86::VDBPSADBWZrrik, X86::VDBPSADBWZrmik, 0}, + {X86::VDIVNEPBF16Z128rrk, X86::VDIVNEPBF16Z128rmk, 0}, + {X86::VDIVNEPBF16Z256rrk, X86::VDIVNEPBF16Z256rmk, 0}, + {X86::VDIVNEPBF16Zrrk, X86::VDIVNEPBF16Zrmk, 0}, {X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0}, {X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0}, {X86::VDIVPDZrrk, X86::VDIVPDZrmk, 
0}, @@ -5935,6 +6096,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFIXUPIMMSDZrrikz, X86::VFIXUPIMMSDZrmikz, TB_NO_REVERSE}, {X86::VFIXUPIMMSSZrrik, X86::VFIXUPIMMSSZrmik, TB_NO_REVERSE}, {X86::VFIXUPIMMSSZrrikz, X86::VFIXUPIMMSSZrmikz, TB_NO_REVERSE}, + {X86::VFMADD132NEPBF16Z128rk, X86::VFMADD132NEPBF16Z128mk, 0}, + {X86::VFMADD132NEPBF16Z128rkz, X86::VFMADD132NEPBF16Z128mkz, 0}, + {X86::VFMADD132NEPBF16Z256rk, X86::VFMADD132NEPBF16Z256mk, 0}, + {X86::VFMADD132NEPBF16Z256rkz, X86::VFMADD132NEPBF16Z256mkz, 0}, + {X86::VFMADD132NEPBF16Zrk, X86::VFMADD132NEPBF16Zmk, 0}, + {X86::VFMADD132NEPBF16Zrkz, X86::VFMADD132NEPBF16Zmkz, 0}, {X86::VFMADD132PDZ128rk, X86::VFMADD132PDZ128mk, 0}, {X86::VFMADD132PDZ128rkz, X86::VFMADD132PDZ128mkz, 0}, {X86::VFMADD132PDZ256rk, X86::VFMADD132PDZ256mk, 0}, @@ -5959,6 +6126,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMADD132SHZr_Intkz, X86::VFMADD132SHZm_Intkz, TB_NO_REVERSE}, {X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE}, {X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMADD213NEPBF16Z128rk, X86::VFMADD213NEPBF16Z128mk, 0}, + {X86::VFMADD213NEPBF16Z128rkz, X86::VFMADD213NEPBF16Z128mkz, 0}, + {X86::VFMADD213NEPBF16Z256rk, X86::VFMADD213NEPBF16Z256mk, 0}, + {X86::VFMADD213NEPBF16Z256rkz, X86::VFMADD213NEPBF16Z256mkz, 0}, + {X86::VFMADD213NEPBF16Zrk, X86::VFMADD213NEPBF16Zmk, 0}, + {X86::VFMADD213NEPBF16Zrkz, X86::VFMADD213NEPBF16Zmkz, 0}, {X86::VFMADD213PDZ128rk, X86::VFMADD213PDZ128mk, 0}, {X86::VFMADD213PDZ128rkz, X86::VFMADD213PDZ128mkz, 0}, {X86::VFMADD213PDZ256rk, X86::VFMADD213PDZ256mk, 0}, @@ -5983,6 +6156,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMADD213SHZr_Intkz, X86::VFMADD213SHZm_Intkz, TB_NO_REVERSE}, {X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE}, {X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMADD231NEPBF16Z128rk, X86::VFMADD231NEPBF16Z128mk, 0}, + {X86::VFMADD231NEPBF16Z128rkz, X86::VFMADD231NEPBF16Z128mkz, 0}, + {X86::VFMADD231NEPBF16Z256rk, X86::VFMADD231NEPBF16Z256mk, 0}, + {X86::VFMADD231NEPBF16Z256rkz, X86::VFMADD231NEPBF16Z256mkz, 0}, + {X86::VFMADD231NEPBF16Zrk, X86::VFMADD231NEPBF16Zmk, 0}, + {X86::VFMADD231NEPBF16Zrkz, X86::VFMADD231NEPBF16Zmkz, 0}, {X86::VFMADD231PDZ128rk, X86::VFMADD231PDZ128mk, 0}, {X86::VFMADD231PDZ128rkz, X86::VFMADD231PDZ128mkz, 0}, {X86::VFMADD231PDZ256rk, X86::VFMADD231PDZ256mk, 0}, @@ -6069,6 +6248,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMADDSUB231PSZ256rkz, X86::VFMADDSUB231PSZ256mkz, 0}, {X86::VFMADDSUB231PSZrk, X86::VFMADDSUB231PSZmk, 0}, {X86::VFMADDSUB231PSZrkz, X86::VFMADDSUB231PSZmkz, 0}, + {X86::VFMSUB132NEPBF16Z128rk, X86::VFMSUB132NEPBF16Z128mk, 0}, + {X86::VFMSUB132NEPBF16Z128rkz, X86::VFMSUB132NEPBF16Z128mkz, 0}, + {X86::VFMSUB132NEPBF16Z256rk, X86::VFMSUB132NEPBF16Z256mk, 0}, + {X86::VFMSUB132NEPBF16Z256rkz, X86::VFMSUB132NEPBF16Z256mkz, 0}, + {X86::VFMSUB132NEPBF16Zrk, X86::VFMSUB132NEPBF16Zmk, 0}, + {X86::VFMSUB132NEPBF16Zrkz, X86::VFMSUB132NEPBF16Zmkz, 0}, {X86::VFMSUB132PDZ128rk, X86::VFMSUB132PDZ128mk, 0}, {X86::VFMSUB132PDZ128rkz, X86::VFMSUB132PDZ128mkz, 0}, {X86::VFMSUB132PDZ256rk, X86::VFMSUB132PDZ256mk, 0}, @@ -6093,6 +6278,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMSUB132SHZr_Intkz, X86::VFMSUB132SHZm_Intkz, TB_NO_REVERSE}, {X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE}, {X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMSUB213NEPBF16Z128rk, 
X86::VFMSUB213NEPBF16Z128mk, 0}, + {X86::VFMSUB213NEPBF16Z128rkz, X86::VFMSUB213NEPBF16Z128mkz, 0}, + {X86::VFMSUB213NEPBF16Z256rk, X86::VFMSUB213NEPBF16Z256mk, 0}, + {X86::VFMSUB213NEPBF16Z256rkz, X86::VFMSUB213NEPBF16Z256mkz, 0}, + {X86::VFMSUB213NEPBF16Zrk, X86::VFMSUB213NEPBF16Zmk, 0}, + {X86::VFMSUB213NEPBF16Zrkz, X86::VFMSUB213NEPBF16Zmkz, 0}, {X86::VFMSUB213PDZ128rk, X86::VFMSUB213PDZ128mk, 0}, {X86::VFMSUB213PDZ128rkz, X86::VFMSUB213PDZ128mkz, 0}, {X86::VFMSUB213PDZ256rk, X86::VFMSUB213PDZ256mk, 0}, @@ -6117,6 +6308,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMSUB213SHZr_Intkz, X86::VFMSUB213SHZm_Intkz, TB_NO_REVERSE}, {X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE}, {X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMSUB231NEPBF16Z128rk, X86::VFMSUB231NEPBF16Z128mk, 0}, + {X86::VFMSUB231NEPBF16Z128rkz, X86::VFMSUB231NEPBF16Z128mkz, 0}, + {X86::VFMSUB231NEPBF16Z256rk, X86::VFMSUB231NEPBF16Z256mk, 0}, + {X86::VFMSUB231NEPBF16Z256rkz, X86::VFMSUB231NEPBF16Z256mkz, 0}, + {X86::VFMSUB231NEPBF16Zrk, X86::VFMSUB231NEPBF16Zmk, 0}, + {X86::VFMSUB231NEPBF16Zrkz, X86::VFMSUB231NEPBF16Zmkz, 0}, {X86::VFMSUB231PDZ128rk, X86::VFMSUB231PDZ128mk, 0}, {X86::VFMSUB231PDZ128rkz, X86::VFMSUB231PDZ128mkz, 0}, {X86::VFMSUB231PDZ256rk, X86::VFMSUB231PDZ256mk, 0}, @@ -6199,6 +6396,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMULCPHZ256rrk, X86::VFMULCPHZ256rmk, 0}, {X86::VFMULCPHZrrk, X86::VFMULCPHZrmk, 0}, {X86::VFMULCSHZrrk, X86::VFMULCSHZrmk, TB_NO_REVERSE}, + {X86::VFNMADD132NEPBF16Z128rk, X86::VFNMADD132NEPBF16Z128mk, 0}, + {X86::VFNMADD132NEPBF16Z128rkz, X86::VFNMADD132NEPBF16Z128mkz, 0}, + {X86::VFNMADD132NEPBF16Z256rk, X86::VFNMADD132NEPBF16Z256mk, 0}, + {X86::VFNMADD132NEPBF16Z256rkz, X86::VFNMADD132NEPBF16Z256mkz, 0}, + {X86::VFNMADD132NEPBF16Zrk, X86::VFNMADD132NEPBF16Zmk, 0}, + {X86::VFNMADD132NEPBF16Zrkz, X86::VFNMADD132NEPBF16Zmkz, 0}, {X86::VFNMADD132PDZ128rk, X86::VFNMADD132PDZ128mk, 0}, {X86::VFNMADD132PDZ128rkz, X86::VFNMADD132PDZ128mkz, 0}, {X86::VFNMADD132PDZ256rk, X86::VFNMADD132PDZ256mk, 0}, @@ -6223,6 +6426,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMADD132SHZr_Intkz, X86::VFNMADD132SHZm_Intkz, TB_NO_REVERSE}, {X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE}, {X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMADD213NEPBF16Z128rk, X86::VFNMADD213NEPBF16Z128mk, 0}, + {X86::VFNMADD213NEPBF16Z128rkz, X86::VFNMADD213NEPBF16Z128mkz, 0}, + {X86::VFNMADD213NEPBF16Z256rk, X86::VFNMADD213NEPBF16Z256mk, 0}, + {X86::VFNMADD213NEPBF16Z256rkz, X86::VFNMADD213NEPBF16Z256mkz, 0}, + {X86::VFNMADD213NEPBF16Zrk, X86::VFNMADD213NEPBF16Zmk, 0}, + {X86::VFNMADD213NEPBF16Zrkz, X86::VFNMADD213NEPBF16Zmkz, 0}, {X86::VFNMADD213PDZ128rk, X86::VFNMADD213PDZ128mk, 0}, {X86::VFNMADD213PDZ128rkz, X86::VFNMADD213PDZ128mkz, 0}, {X86::VFNMADD213PDZ256rk, X86::VFNMADD213PDZ256mk, 0}, @@ -6247,6 +6456,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMADD213SHZr_Intkz, X86::VFNMADD213SHZm_Intkz, TB_NO_REVERSE}, {X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE}, {X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMADD231NEPBF16Z128rk, X86::VFNMADD231NEPBF16Z128mk, 0}, + {X86::VFNMADD231NEPBF16Z128rkz, X86::VFNMADD231NEPBF16Z128mkz, 0}, + {X86::VFNMADD231NEPBF16Z256rk, X86::VFNMADD231NEPBF16Z256mk, 0}, + {X86::VFNMADD231NEPBF16Z256rkz, X86::VFNMADD231NEPBF16Z256mkz, 0}, + {X86::VFNMADD231NEPBF16Zrk, X86::VFNMADD231NEPBF16Zmk, 
0}, + {X86::VFNMADD231NEPBF16Zrkz, X86::VFNMADD231NEPBF16Zmkz, 0}, {X86::VFNMADD231PDZ128rk, X86::VFNMADD231PDZ128mk, 0}, {X86::VFNMADD231PDZ128rkz, X86::VFNMADD231PDZ128mkz, 0}, {X86::VFNMADD231PDZ256rk, X86::VFNMADD231PDZ256mk, 0}, @@ -6271,6 +6486,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMADD231SHZr_Intkz, X86::VFNMADD231SHZm_Intkz, TB_NO_REVERSE}, {X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE}, {X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMSUB132NEPBF16Z128rk, X86::VFNMSUB132NEPBF16Z128mk, 0}, + {X86::VFNMSUB132NEPBF16Z128rkz, X86::VFNMSUB132NEPBF16Z128mkz, 0}, + {X86::VFNMSUB132NEPBF16Z256rk, X86::VFNMSUB132NEPBF16Z256mk, 0}, + {X86::VFNMSUB132NEPBF16Z256rkz, X86::VFNMSUB132NEPBF16Z256mkz, 0}, + {X86::VFNMSUB132NEPBF16Zrk, X86::VFNMSUB132NEPBF16Zmk, 0}, + {X86::VFNMSUB132NEPBF16Zrkz, X86::VFNMSUB132NEPBF16Zmkz, 0}, {X86::VFNMSUB132PDZ128rk, X86::VFNMSUB132PDZ128mk, 0}, {X86::VFNMSUB132PDZ128rkz, X86::VFNMSUB132PDZ128mkz, 0}, {X86::VFNMSUB132PDZ256rk, X86::VFNMSUB132PDZ256mk, 0}, @@ -6295,6 +6516,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMSUB132SHZr_Intkz, X86::VFNMSUB132SHZm_Intkz, TB_NO_REVERSE}, {X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE}, {X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMSUB213NEPBF16Z128rk, X86::VFNMSUB213NEPBF16Z128mk, 0}, + {X86::VFNMSUB213NEPBF16Z128rkz, X86::VFNMSUB213NEPBF16Z128mkz, 0}, + {X86::VFNMSUB213NEPBF16Z256rk, X86::VFNMSUB213NEPBF16Z256mk, 0}, + {X86::VFNMSUB213NEPBF16Z256rkz, X86::VFNMSUB213NEPBF16Z256mkz, 0}, + {X86::VFNMSUB213NEPBF16Zrk, X86::VFNMSUB213NEPBF16Zmk, 0}, + {X86::VFNMSUB213NEPBF16Zrkz, X86::VFNMSUB213NEPBF16Zmkz, 0}, {X86::VFNMSUB213PDZ128rk, X86::VFNMSUB213PDZ128mk, 0}, {X86::VFNMSUB213PDZ128rkz, X86::VFNMSUB213PDZ128mkz, 0}, {X86::VFNMSUB213PDZ256rk, X86::VFNMSUB213PDZ256mk, 0}, @@ -6319,6 +6546,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMSUB213SHZr_Intkz, X86::VFNMSUB213SHZm_Intkz, TB_NO_REVERSE}, {X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE}, {X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMSUB231NEPBF16Z128rk, X86::VFNMSUB231NEPBF16Z128mk, 0}, + {X86::VFNMSUB231NEPBF16Z128rkz, X86::VFNMSUB231NEPBF16Z128mkz, 0}, + {X86::VFNMSUB231NEPBF16Z256rk, X86::VFNMSUB231NEPBF16Z256mk, 0}, + {X86::VFNMSUB231NEPBF16Z256rkz, X86::VFNMSUB231NEPBF16Z256mkz, 0}, + {X86::VFNMSUB231NEPBF16Zrk, X86::VFNMSUB231NEPBF16Zmk, 0}, + {X86::VFNMSUB231NEPBF16Zrkz, X86::VFNMSUB231NEPBF16Zmkz, 0}, {X86::VFNMSUB231PDZ128rk, X86::VFNMSUB231PDZ128mk, 0}, {X86::VFNMSUB231PDZ128rkz, X86::VFNMSUB231PDZ128mkz, 0}, {X86::VFNMSUB231PDZ256rk, X86::VFNMSUB231PDZ256mk, 0}, @@ -6379,6 +6612,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0}, {X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0}, {X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0}, + {X86::VMAXPBF16Z128rrk, X86::VMAXPBF16Z128rmk, 0}, + {X86::VMAXPBF16Z256rrk, X86::VMAXPBF16Z256rmk, 0}, + {X86::VMAXPBF16Zrrk, X86::VMAXPBF16Zrmk, 0}, {X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0}, {X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0}, {X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0}, @@ -6415,6 +6651,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VMINMAXSDrrik, X86::VMINMAXSDrmik, TB_NO_REVERSE}, {X86::VMINMAXSHrrik, X86::VMINMAXSHrmik, TB_NO_REVERSE}, {X86::VMINMAXSSrrik, X86::VMINMAXSSrmik, TB_NO_REVERSE}, + {X86::VMINPBF16Z128rrk, X86::VMINPBF16Z128rmk, 0}, + 
{X86::VMINPBF16Z256rrk, X86::VMINPBF16Z256rmk, 0}, + {X86::VMINPBF16Zrrk, X86::VMINPBF16Zrmk, 0}, {X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0}, {X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0}, {X86::VMINPDZrrk, X86::VMINPDZrmk, 0}, @@ -6430,6 +6669,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VMPSADBWZ128rrik, X86::VMPSADBWZ128rmik, 0}, {X86::VMPSADBWZ256rrik, X86::VMPSADBWZ256rmik, 0}, {X86::VMPSADBWZrrik, X86::VMPSADBWZrmik, 0}, + {X86::VMULNEPBF16Z128rrk, X86::VMULNEPBF16Z128rmk, 0}, + {X86::VMULNEPBF16Z256rrk, X86::VMULNEPBF16Z256rmk, 0}, + {X86::VMULNEPBF16Zrrk, X86::VMULNEPBF16Zrmk, 0}, {X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0}, {X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0}, {X86::VMULPDZrrk, X86::VMULPDZrmk, 0}, @@ -7005,6 +7247,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE}, {X86::VRSQRT28SSZrk, X86::VRSQRT28SSZmk, TB_NO_REVERSE}, {X86::VRSQRTSHZrrk, X86::VRSQRTSHZrmk, TB_NO_REVERSE}, + {X86::VSCALEFPBF16Z128rrk, X86::VSCALEFPBF16Z128rmk, 0}, + {X86::VSCALEFPBF16Z256rrk, X86::VSCALEFPBF16Z256rmk, 0}, + {X86::VSCALEFPBF16Zrrk, X86::VSCALEFPBF16Zrmk, 0}, {X86::VSCALEFPDZ128rrk, X86::VSCALEFPDZ128rmk, 0}, {X86::VSCALEFPDZ256rrk, X86::VSCALEFPDZ256rmk, 0}, {X86::VSCALEFPDZrrk, X86::VSCALEFPDZrmk, 0}, @@ -7034,6 +7279,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE}, {X86::VSQRTSHZr_Intk, X86::VSQRTSHZm_Intk, TB_NO_REVERSE}, {X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE}, + {X86::VSUBNEPBF16Z128rrk, X86::VSUBNEPBF16Z128rmk, 0}, + {X86::VSUBNEPBF16Z256rrk, X86::VSUBNEPBF16Z256rmk, 0}, + {X86::VSUBNEPBF16Zrrk, X86::VSUBNEPBF16Zrmk, 0}, {X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0}, {X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0}, {X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0}, @@ -7264,6 +7512,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTW2PHZrr, X86::VCVTW2PHZrmb, TB_BCAST_W}, {X86::VEXP2PDZr, X86::VEXP2PDZmb, TB_BCAST_SD}, {X86::VEXP2PSZr, X86::VEXP2PSZmb, TB_BCAST_SS}, + {X86::VFPCLASSPBF16Z128rr, X86::VFPCLASSPBF16Z128rmb, TB_BCAST_SH}, + {X86::VFPCLASSPBF16Z256rr, X86::VFPCLASSPBF16Z256rmb, TB_BCAST_SH}, + {X86::VFPCLASSPBF16Zrr, X86::VFPCLASSPBF16Zrmb, TB_BCAST_SH}, {X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rmb, TB_BCAST_SD}, {X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rmb, TB_BCAST_SD}, {X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrmb, TB_BCAST_SD}, @@ -7273,6 +7524,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VFPCLASSPSZ128rr, X86::VFPCLASSPSZ128rmb, TB_BCAST_SS}, {X86::VFPCLASSPSZ256rr, X86::VFPCLASSPSZ256rmb, TB_BCAST_SS}, {X86::VFPCLASSPSZrr, X86::VFPCLASSPSZrmb, TB_BCAST_SS}, + {X86::VGETEXPPBF16Z128r, X86::VGETEXPPBF16Z128mb, TB_BCAST_SH}, + {X86::VGETEXPPBF16Z256r, X86::VGETEXPPBF16Z256mb, TB_BCAST_SH}, + {X86::VGETEXPPBF16Zr, X86::VGETEXPPBF16Zmb, TB_BCAST_SH}, {X86::VGETEXPPDZ128r, X86::VGETEXPPDZ128mb, TB_BCAST_SD}, {X86::VGETEXPPDZ256r, X86::VGETEXPPDZ256mb, TB_BCAST_SD}, {X86::VGETEXPPDZr, X86::VGETEXPPDZmb, TB_BCAST_SD}, @@ -7282,6 +7536,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VGETEXPPSZ128r, X86::VGETEXPPSZ128mb, TB_BCAST_SS}, {X86::VGETEXPPSZ256r, X86::VGETEXPPSZ256mb, TB_BCAST_SS}, {X86::VGETEXPPSZr, X86::VGETEXPPSZmb, TB_BCAST_SS}, + {X86::VGETMANTPBF16Z128rri, X86::VGETMANTPBF16Z128rmbi, TB_BCAST_SH}, + {X86::VGETMANTPBF16Z256rri, X86::VGETMANTPBF16Z256rmbi, TB_BCAST_SH}, + {X86::VGETMANTPBF16Zrri, X86::VGETMANTPBF16Zrmbi, TB_BCAST_SH}, {X86::VGETMANTPDZ128rri, X86::VGETMANTPDZ128rmbi, 
TB_BCAST_SD}, {X86::VGETMANTPDZ256rri, X86::VGETMANTPDZ256rmbi, TB_BCAST_SD}, {X86::VGETMANTPDZrri, X86::VGETMANTPDZrmbi, TB_BCAST_SD}, @@ -7366,9 +7623,15 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VRCP14PSZr, X86::VRCP14PSZmb, TB_BCAST_SS}, {X86::VRCP28PDZr, X86::VRCP28PDZmb, TB_BCAST_SD}, {X86::VRCP28PSZr, X86::VRCP28PSZmb, TB_BCAST_SS}, + {X86::VRCPPBF16Z128r, X86::VRCPPBF16Z128mb, TB_BCAST_SH}, + {X86::VRCPPBF16Z256r, X86::VRCPPBF16Z256mb, TB_BCAST_SH}, + {X86::VRCPPBF16Zr, X86::VRCPPBF16Zmb, TB_BCAST_SH}, {X86::VRCPPHZ128r, X86::VRCPPHZ128mb, TB_BCAST_SH}, {X86::VRCPPHZ256r, X86::VRCPPHZ256mb, TB_BCAST_SH}, {X86::VRCPPHZr, X86::VRCPPHZmb, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z128rri, X86::VREDUCENEPBF16Z128rmbi, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z256rri, X86::VREDUCENEPBF16Z256rmbi, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Zrri, X86::VREDUCENEPBF16Zrmbi, TB_BCAST_SH}, {X86::VREDUCEPDZ128rri, X86::VREDUCEPDZ128rmbi, TB_BCAST_SD}, {X86::VREDUCEPDZ256rri, X86::VREDUCEPDZ256rmbi, TB_BCAST_SD}, {X86::VREDUCEPDZrri, X86::VREDUCEPDZrmbi, TB_BCAST_SD}, @@ -7378,6 +7641,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VREDUCEPSZ128rri, X86::VREDUCEPSZ128rmbi, TB_BCAST_SS}, {X86::VREDUCEPSZ256rri, X86::VREDUCEPSZ256rmbi, TB_BCAST_SS}, {X86::VREDUCEPSZrri, X86::VREDUCEPSZrmbi, TB_BCAST_SS}, + {X86::VRNDSCALENEPBF16Z128rri, X86::VRNDSCALENEPBF16Z128rmbi, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Z256rri, X86::VRNDSCALENEPBF16Z256rmbi, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Zrri, X86::VRNDSCALENEPBF16Zrmbi, TB_BCAST_SH}, {X86::VRNDSCALEPDZ128rri, X86::VRNDSCALEPDZ128rmbi, TB_BCAST_SD}, {X86::VRNDSCALEPDZ256rri, X86::VRNDSCALEPDZ256rmbi, TB_BCAST_SD}, {X86::VRNDSCALEPDZrri, X86::VRNDSCALEPDZrmbi, TB_BCAST_SD}, @@ -7395,9 +7661,15 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VRSQRT14PSZr, X86::VRSQRT14PSZmb, TB_BCAST_SS}, {X86::VRSQRT28PDZr, X86::VRSQRT28PDZmb, TB_BCAST_SD}, {X86::VRSQRT28PSZr, X86::VRSQRT28PSZmb, TB_BCAST_SS}, + {X86::VRSQRTPBF16Z128r, X86::VRSQRTPBF16Z128mb, TB_BCAST_SH}, + {X86::VRSQRTPBF16Z256r, X86::VRSQRTPBF16Z256mb, TB_BCAST_SH}, + {X86::VRSQRTPBF16Zr, X86::VRSQRTPBF16Zmb, TB_BCAST_SH}, {X86::VRSQRTPHZ128r, X86::VRSQRTPHZ128mb, TB_BCAST_SH}, {X86::VRSQRTPHZ256r, X86::VRSQRTPHZ256mb, TB_BCAST_SH}, {X86::VRSQRTPHZr, X86::VRSQRTPHZmb, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Z128r, X86::VSQRTNEPBF16Z128mb, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Z256r, X86::VSQRTNEPBF16Z256mb, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Zr, X86::VSQRTNEPBF16Zmb, TB_BCAST_SH}, {X86::VSQRTPDZ128r, X86::VSQRTPDZ128mb, TB_BCAST_SD}, {X86::VSQRTPDZ256r, X86::VSQRTPDZ256mb, TB_BCAST_SD}, {X86::VSQRTPDZr, X86::VSQRTPDZmb, TB_BCAST_SD}, @@ -7410,6 +7682,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { }; static const X86FoldTableEntry BroadcastTable2[] = { + {X86::VADDNEPBF16Z128rr, X86::VADDNEPBF16Z128rmb, TB_BCAST_SH}, + {X86::VADDNEPBF16Z256rr, X86::VADDNEPBF16Z256rmb, TB_BCAST_SH}, + {X86::VADDNEPBF16Zrr, X86::VADDNEPBF16Zrmb, TB_BCAST_SH}, {X86::VADDPDZ128rr, X86::VADDPDZ128rmb, TB_BCAST_SD}, {X86::VADDPDZ256rr, X86::VADDPDZ256rmb, TB_BCAST_SD}, {X86::VADDPDZrr, X86::VADDPDZrmb, TB_BCAST_SD}, @@ -7443,6 +7718,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VBLENDMPSZ128rr, X86::VBLENDMPSZ128rmb, TB_BCAST_SS}, {X86::VBLENDMPSZ256rr, X86::VBLENDMPSZ256rmb, TB_BCAST_SS}, {X86::VBLENDMPSZrr, X86::VBLENDMPSZrmb, TB_BCAST_SS}, + {X86::VCMPPBF16Z128rri, X86::VCMPPBF16Z128rmbi, TB_BCAST_SH}, + {X86::VCMPPBF16Z256rri, X86::VCMPPBF16Z256rmbi, TB_BCAST_SH}, + 
{X86::VCMPPBF16Zrri, X86::VCMPPBF16Zrmbi, TB_BCAST_SH}, {X86::VCMPPDZ128rri, X86::VCMPPDZ128rmbi, TB_BCAST_SD}, {X86::VCMPPDZ256rri, X86::VCMPPDZ256rmbi, TB_BCAST_SD}, {X86::VCMPPDZrri, X86::VCMPPDZrmbi, TB_BCAST_SD}, @@ -7677,6 +7955,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTW2PHZ128rrkz, X86::VCVTW2PHZ128rmbkz, TB_BCAST_W}, {X86::VCVTW2PHZ256rrkz, X86::VCVTW2PHZ256rmbkz, TB_BCAST_W}, {X86::VCVTW2PHZrrkz, X86::VCVTW2PHZrmbkz, TB_BCAST_W}, + {X86::VDIVNEPBF16Z128rr, X86::VDIVNEPBF16Z128rmb, TB_BCAST_SH}, + {X86::VDIVNEPBF16Z256rr, X86::VDIVNEPBF16Z256rmb, TB_BCAST_SH}, + {X86::VDIVNEPBF16Zrr, X86::VDIVNEPBF16Zrmb, TB_BCAST_SH}, {X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD}, {X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD}, {X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD}, @@ -7694,6 +7975,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VFMULCPHZ128rr, X86::VFMULCPHZ128rmb, TB_BCAST_SS}, {X86::VFMULCPHZ256rr, X86::VFMULCPHZ256rmb, TB_BCAST_SS}, {X86::VFMULCPHZrr, X86::VFMULCPHZrmb, TB_BCAST_SS}, + {X86::VFPCLASSPBF16Z128rrk, X86::VFPCLASSPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VFPCLASSPBF16Z256rrk, X86::VFPCLASSPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VFPCLASSPBF16Zrrk, X86::VFPCLASSPBF16Zrmbk, TB_BCAST_SH}, {X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmbk, TB_BCAST_SD}, {X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmbk, TB_BCAST_SD}, {X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmbk, TB_BCAST_SD}, @@ -7703,6 +7987,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VFPCLASSPSZ128rrk, X86::VFPCLASSPSZ128rmbk, TB_BCAST_SS}, {X86::VFPCLASSPSZ256rrk, X86::VFPCLASSPSZ256rmbk, TB_BCAST_SS}, {X86::VFPCLASSPSZrrk, X86::VFPCLASSPSZrmbk, TB_BCAST_SS}, + {X86::VGETEXPPBF16Z128rkz, X86::VGETEXPPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VGETEXPPBF16Z256rkz, X86::VGETEXPPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VGETEXPPBF16Zrkz, X86::VGETEXPPBF16Zmbkz, TB_BCAST_SH}, {X86::VGETEXPPDZ128rkz, X86::VGETEXPPDZ128mbkz, TB_BCAST_SD}, {X86::VGETEXPPDZ256rkz, X86::VGETEXPPDZ256mbkz, TB_BCAST_SD}, {X86::VGETEXPPDZrkz, X86::VGETEXPPDZmbkz, TB_BCAST_SD}, @@ -7712,6 +7999,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VGETEXPPSZ128rkz, X86::VGETEXPPSZ128mbkz, TB_BCAST_SS}, {X86::VGETEXPPSZ256rkz, X86::VGETEXPPSZ256mbkz, TB_BCAST_SS}, {X86::VGETEXPPSZrkz, X86::VGETEXPPSZmbkz, TB_BCAST_SS}, + {X86::VGETMANTPBF16Z128rrikz, X86::VGETMANTPBF16Z128rmbikz, TB_BCAST_SH}, + {X86::VGETMANTPBF16Z256rrikz, X86::VGETMANTPBF16Z256rmbikz, TB_BCAST_SH}, + {X86::VGETMANTPBF16Zrrikz, X86::VGETMANTPBF16Zrmbikz, TB_BCAST_SH}, {X86::VGETMANTPDZ128rrikz, X86::VGETMANTPDZ128rmbikz, TB_BCAST_SD}, {X86::VGETMANTPDZ256rrikz, X86::VGETMANTPDZ256rmbikz, TB_BCAST_SD}, {X86::VGETMANTPDZrrikz, X86::VGETMANTPDZrmbikz, TB_BCAST_SD}, @@ -7736,6 +8026,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rmb, TB_BCAST_SS}, {X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rmb, TB_BCAST_SS}, {X86::VMAXCPSZrr, X86::VMAXCPSZrmb, TB_BCAST_SS}, + {X86::VMAXPBF16Z128rr, X86::VMAXPBF16Z128rmb, TB_BCAST_SH}, + {X86::VMAXPBF16Z256rr, X86::VMAXPBF16Z256rmb, TB_BCAST_SH}, + {X86::VMAXPBF16Zrr, X86::VMAXPBF16Zrmb, TB_BCAST_SH}, {X86::VMAXPDZ128rr, X86::VMAXPDZ128rmb, TB_BCAST_SD}, {X86::VMAXPDZ256rr, X86::VMAXPDZ256rmb, TB_BCAST_SD}, {X86::VMAXPDZrr, X86::VMAXPDZrmb, TB_BCAST_SD}, @@ -7766,6 +8059,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VMINMAXPSZ128rri, X86::VMINMAXPSZ128rmbi, TB_BCAST_SS}, {X86::VMINMAXPSZ256rri, X86::VMINMAXPSZ256rmbi, TB_BCAST_SS}, 
{X86::VMINMAXPSZrri, X86::VMINMAXPSZrmbi, TB_BCAST_SS}, + {X86::VMINPBF16Z128rr, X86::VMINPBF16Z128rmb, TB_BCAST_SH}, + {X86::VMINPBF16Z256rr, X86::VMINPBF16Z256rmb, TB_BCAST_SH}, + {X86::VMINPBF16Zrr, X86::VMINPBF16Zrmb, TB_BCAST_SH}, {X86::VMINPDZ128rr, X86::VMINPDZ128rmb, TB_BCAST_SD}, {X86::VMINPDZ256rr, X86::VMINPDZ256rmb, TB_BCAST_SD}, {X86::VMINPDZrr, X86::VMINPDZrmb, TB_BCAST_SD}, @@ -7775,6 +8071,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VMINPSZ128rr, X86::VMINPSZ128rmb, TB_BCAST_SS}, {X86::VMINPSZ256rr, X86::VMINPSZ256rmb, TB_BCAST_SS}, {X86::VMINPSZrr, X86::VMINPSZrmb, TB_BCAST_SS}, + {X86::VMULNEPBF16Z128rr, X86::VMULNEPBF16Z128rmb, TB_BCAST_SH}, + {X86::VMULNEPBF16Z256rr, X86::VMULNEPBF16Z256rmb, TB_BCAST_SH}, + {X86::VMULNEPBF16Zrr, X86::VMULNEPBF16Zrmb, TB_BCAST_SH}, {X86::VMULPDZ128rr, X86::VMULPDZ128rmb, TB_BCAST_SD}, {X86::VMULPDZ256rr, X86::VMULPDZ256rmb, TB_BCAST_SD}, {X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD}, @@ -8068,9 +8367,15 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VRCP14PSZrkz, X86::VRCP14PSZmbkz, TB_BCAST_SS}, {X86::VRCP28PDZrkz, X86::VRCP28PDZmbkz, TB_BCAST_SD}, {X86::VRCP28PSZrkz, X86::VRCP28PSZmbkz, TB_BCAST_SS}, + {X86::VRCPPBF16Z128rkz, X86::VRCPPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VRCPPBF16Z256rkz, X86::VRCPPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VRCPPBF16Zrkz, X86::VRCPPBF16Zmbkz, TB_BCAST_SH}, {X86::VRCPPHZ128rkz, X86::VRCPPHZ128mbkz, TB_BCAST_SH}, {X86::VRCPPHZ256rkz, X86::VRCPPHZ256mbkz, TB_BCAST_SH}, {X86::VRCPPHZrkz, X86::VRCPPHZmbkz, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z128rrikz, X86::VREDUCENEPBF16Z128rmbikz, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z256rrikz, X86::VREDUCENEPBF16Z256rmbikz, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Zrrikz, X86::VREDUCENEPBF16Zrmbikz, TB_BCAST_SH}, {X86::VREDUCEPDZ128rrikz, X86::VREDUCEPDZ128rmbikz, TB_BCAST_SD}, {X86::VREDUCEPDZ256rrikz, X86::VREDUCEPDZ256rmbikz, TB_BCAST_SD}, {X86::VREDUCEPDZrrikz, X86::VREDUCEPDZrmbikz, TB_BCAST_SD}, @@ -8080,6 +8385,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VREDUCEPSZ128rrikz, X86::VREDUCEPSZ128rmbikz, TB_BCAST_SS}, {X86::VREDUCEPSZ256rrikz, X86::VREDUCEPSZ256rmbikz, TB_BCAST_SS}, {X86::VREDUCEPSZrrikz, X86::VREDUCEPSZrmbikz, TB_BCAST_SS}, + {X86::VRNDSCALENEPBF16Z128rrikz, X86::VRNDSCALENEPBF16Z128rmbikz, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Z256rrikz, X86::VRNDSCALENEPBF16Z256rmbikz, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Zrrikz, X86::VRNDSCALENEPBF16Zrmbikz, TB_BCAST_SH}, {X86::VRNDSCALEPDZ128rrikz, X86::VRNDSCALEPDZ128rmbikz, TB_BCAST_SD}, {X86::VRNDSCALEPDZ256rrikz, X86::VRNDSCALEPDZ256rmbikz, TB_BCAST_SD}, {X86::VRNDSCALEPDZrrikz, X86::VRNDSCALEPDZrmbikz, TB_BCAST_SD}, @@ -8097,9 +8405,15 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VRSQRT14PSZrkz, X86::VRSQRT14PSZmbkz, TB_BCAST_SS}, {X86::VRSQRT28PDZrkz, X86::VRSQRT28PDZmbkz, TB_BCAST_SD}, {X86::VRSQRT28PSZrkz, X86::VRSQRT28PSZmbkz, TB_BCAST_SS}, + {X86::VRSQRTPBF16Z128rkz, X86::VRSQRTPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VRSQRTPBF16Z256rkz, X86::VRSQRTPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VRSQRTPBF16Zrkz, X86::VRSQRTPBF16Zmbkz, TB_BCAST_SH}, {X86::VRSQRTPHZ128rkz, X86::VRSQRTPHZ128mbkz, TB_BCAST_SH}, {X86::VRSQRTPHZ256rkz, X86::VRSQRTPHZ256mbkz, TB_BCAST_SH}, {X86::VRSQRTPHZrkz, X86::VRSQRTPHZmbkz, TB_BCAST_SH}, + {X86::VSCALEFPBF16Z128rr, X86::VSCALEFPBF16Z128rmb, TB_BCAST_SH}, + {X86::VSCALEFPBF16Z256rr, X86::VSCALEFPBF16Z256rmb, TB_BCAST_SH}, + {X86::VSCALEFPBF16Zrr, X86::VSCALEFPBF16Zrmb, TB_BCAST_SH}, {X86::VSCALEFPDZ128rr, X86::VSCALEFPDZ128rmb, 
TB_BCAST_SD}, {X86::VSCALEFPDZ256rr, X86::VSCALEFPDZ256rmb, TB_BCAST_SD}, {X86::VSCALEFPDZrr, X86::VSCALEFPDZrmb, TB_BCAST_SD}, @@ -8123,6 +8437,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmbi, TB_BCAST_SS}, {X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmbi, TB_BCAST_SS}, {X86::VSHUFPSZrri, X86::VSHUFPSZrmbi, TB_BCAST_SS}, + {X86::VSQRTNEPBF16Z128rkz, X86::VSQRTNEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Z256rkz, X86::VSQRTNEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Zrkz, X86::VSQRTNEPBF16Zmbkz, TB_BCAST_SH}, {X86::VSQRTPDZ128rkz, X86::VSQRTPDZ128mbkz, TB_BCAST_SD}, {X86::VSQRTPDZ256rkz, X86::VSQRTPDZ256mbkz, TB_BCAST_SD}, {X86::VSQRTPDZrkz, X86::VSQRTPDZmbkz, TB_BCAST_SD}, @@ -8132,6 +8449,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VSQRTPSZ128rkz, X86::VSQRTPSZ128mbkz, TB_BCAST_SS}, {X86::VSQRTPSZ256rkz, X86::VSQRTPSZ256mbkz, TB_BCAST_SS}, {X86::VSQRTPSZrkz, X86::VSQRTPSZmbkz, TB_BCAST_SS}, + {X86::VSUBNEPBF16Z128rr, X86::VSUBNEPBF16Z128rmb, TB_BCAST_SH}, + {X86::VSUBNEPBF16Z256rr, X86::VSUBNEPBF16Z256rmb, TB_BCAST_SH}, + {X86::VSUBNEPBF16Zrr, X86::VSUBNEPBF16Zrmb, TB_BCAST_SH}, {X86::VSUBPDZ128rr, X86::VSUBPDZ128rmb, TB_BCAST_SD}, {X86::VSUBPDZ256rr, X86::VSUBPDZ256rmb, TB_BCAST_SD}, {X86::VSUBPDZrr, X86::VSUBPDZrmb, TB_BCAST_SD}, @@ -8162,6 +8482,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { }; static const X86FoldTableEntry BroadcastTable3[] = { + {X86::VADDNEPBF16Z128rrkz, X86::VADDNEPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VADDNEPBF16Z256rrkz, X86::VADDNEPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VADDNEPBF16Zrrkz, X86::VADDNEPBF16Zrmbkz, TB_BCAST_SH}, {X86::VADDPDZ128rrkz, X86::VADDPDZ128rmbkz, TB_BCAST_SD}, {X86::VADDPDZ256rrkz, X86::VADDPDZ256rmbkz, TB_BCAST_SD}, {X86::VADDPDZrrkz, X86::VADDPDZrmbkz, TB_BCAST_SD}, @@ -8195,6 +8518,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmbk, TB_BCAST_SS}, {X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmbk, TB_BCAST_SS}, {X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmbk, TB_BCAST_SS}, + {X86::VCMPPBF16Z128rrik, X86::VCMPPBF16Z128rmbik, TB_BCAST_SH}, + {X86::VCMPPBF16Z256rrik, X86::VCMPPBF16Z256rmbik, TB_BCAST_SH}, + {X86::VCMPPBF16Zrrik, X86::VCMPPBF16Zrmbik, TB_BCAST_SH}, {X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmbik, TB_BCAST_SD}, {X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmbik, TB_BCAST_SD}, {X86::VCMPPDZrrik, X86::VCMPPDZrmbik, TB_BCAST_SD}, @@ -8429,6 +8755,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTW2PHZ128rrk, X86::VCVTW2PHZ128rmbk, TB_BCAST_W}, {X86::VCVTW2PHZ256rrk, X86::VCVTW2PHZ256rmbk, TB_BCAST_W}, {X86::VCVTW2PHZrrk, X86::VCVTW2PHZrmbk, TB_BCAST_W}, + {X86::VDIVNEPBF16Z128rrkz, X86::VDIVNEPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VDIVNEPBF16Z256rrkz, X86::VDIVNEPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VDIVNEPBF16Zrrkz, X86::VDIVNEPBF16Zrmbkz, TB_BCAST_SH}, {X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmbkz, TB_BCAST_SD}, {X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmbkz, TB_BCAST_SD}, {X86::VDIVPDZrrkz, X86::VDIVPDZrmbkz, TB_BCAST_SD}, @@ -8458,6 +8787,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFIXUPIMMPSZ128rri, X86::VFIXUPIMMPSZ128rmbi, TB_BCAST_SS}, {X86::VFIXUPIMMPSZ256rri, X86::VFIXUPIMMPSZ256rmbi, TB_BCAST_SS}, {X86::VFIXUPIMMPSZrri, X86::VFIXUPIMMPSZrmbi, TB_BCAST_SS}, + {X86::VFMADD132NEPBF16Z128r, X86::VFMADD132NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Z256r, X86::VFMADD132NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Zr, X86::VFMADD132NEPBF16Zmb, TB_BCAST_SH}, 
{X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128mb, TB_BCAST_SD}, {X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256mb, TB_BCAST_SD}, {X86::VFMADD132PDZr, X86::VFMADD132PDZmb, TB_BCAST_SD}, @@ -8467,6 +8799,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128mb, TB_BCAST_SS}, {X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256mb, TB_BCAST_SS}, {X86::VFMADD132PSZr, X86::VFMADD132PSZmb, TB_BCAST_SS}, + {X86::VFMADD213NEPBF16Z128r, X86::VFMADD213NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Z256r, X86::VFMADD213NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Zr, X86::VFMADD213NEPBF16Zmb, TB_BCAST_SH}, {X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128mb, TB_BCAST_SD}, {X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256mb, TB_BCAST_SD}, {X86::VFMADD213PDZr, X86::VFMADD213PDZmb, TB_BCAST_SD}, @@ -8476,6 +8811,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128mb, TB_BCAST_SS}, {X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256mb, TB_BCAST_SS}, {X86::VFMADD213PSZr, X86::VFMADD213PSZmb, TB_BCAST_SS}, + {X86::VFMADD231NEPBF16Z128r, X86::VFMADD231NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Z256r, X86::VFMADD231NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Zr, X86::VFMADD231NEPBF16Zmb, TB_BCAST_SH}, {X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128mb, TB_BCAST_SD}, {X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256mb, TB_BCAST_SD}, {X86::VFMADD231PDZr, X86::VFMADD231PDZmb, TB_BCAST_SD}, @@ -8515,6 +8853,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128mb, TB_BCAST_SS}, {X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256mb, TB_BCAST_SS}, {X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZmb, TB_BCAST_SS}, + {X86::VFMSUB132NEPBF16Z128r, X86::VFMSUB132NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Z256r, X86::VFMSUB132NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Zr, X86::VFMSUB132NEPBF16Zmb, TB_BCAST_SH}, {X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128mb, TB_BCAST_SD}, {X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256mb, TB_BCAST_SD}, {X86::VFMSUB132PDZr, X86::VFMSUB132PDZmb, TB_BCAST_SD}, @@ -8524,6 +8865,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128mb, TB_BCAST_SS}, {X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256mb, TB_BCAST_SS}, {X86::VFMSUB132PSZr, X86::VFMSUB132PSZmb, TB_BCAST_SS}, + {X86::VFMSUB213NEPBF16Z128r, X86::VFMSUB213NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Z256r, X86::VFMSUB213NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Zr, X86::VFMSUB213NEPBF16Zmb, TB_BCAST_SH}, {X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128mb, TB_BCAST_SD}, {X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256mb, TB_BCAST_SD}, {X86::VFMSUB213PDZr, X86::VFMSUB213PDZmb, TB_BCAST_SD}, @@ -8533,6 +8877,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128mb, TB_BCAST_SS}, {X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256mb, TB_BCAST_SS}, {X86::VFMSUB213PSZr, X86::VFMSUB213PSZmb, TB_BCAST_SS}, + {X86::VFMSUB231NEPBF16Z128r, X86::VFMSUB231NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Z256r, X86::VFMSUB231NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Zr, X86::VFMSUB231NEPBF16Zmb, TB_BCAST_SH}, {X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128mb, TB_BCAST_SD}, {X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256mb, TB_BCAST_SD}, {X86::VFMSUB231PDZr, X86::VFMSUB231PDZmb, TB_BCAST_SD}, @@ -8572,6 +8919,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMULCPHZ128rrkz, 
X86::VFMULCPHZ128rmbkz, TB_BCAST_SS}, {X86::VFMULCPHZ256rrkz, X86::VFMULCPHZ256rmbkz, TB_BCAST_SS}, {X86::VFMULCPHZrrkz, X86::VFMULCPHZrmbkz, TB_BCAST_SS}, + {X86::VFNMADD132NEPBF16Z128r, X86::VFNMADD132NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Z256r, X86::VFNMADD132NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Zr, X86::VFNMADD132NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128mb, TB_BCAST_SD}, {X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256mb, TB_BCAST_SD}, {X86::VFNMADD132PDZr, X86::VFNMADD132PDZmb, TB_BCAST_SD}, @@ -8581,6 +8931,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128mb, TB_BCAST_SS}, {X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256mb, TB_BCAST_SS}, {X86::VFNMADD132PSZr, X86::VFNMADD132PSZmb, TB_BCAST_SS}, + {X86::VFNMADD213NEPBF16Z128r, X86::VFNMADD213NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Z256r, X86::VFNMADD213NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Zr, X86::VFNMADD213NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128mb, TB_BCAST_SD}, {X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256mb, TB_BCAST_SD}, {X86::VFNMADD213PDZr, X86::VFNMADD213PDZmb, TB_BCAST_SD}, @@ -8590,6 +8943,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128mb, TB_BCAST_SS}, {X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256mb, TB_BCAST_SS}, {X86::VFNMADD213PSZr, X86::VFNMADD213PSZmb, TB_BCAST_SS}, + {X86::VFNMADD231NEPBF16Z128r, X86::VFNMADD231NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Z256r, X86::VFNMADD231NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Zr, X86::VFNMADD231NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128mb, TB_BCAST_SD}, {X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256mb, TB_BCAST_SD}, {X86::VFNMADD231PDZr, X86::VFNMADD231PDZmb, TB_BCAST_SD}, @@ -8599,6 +8955,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128mb, TB_BCAST_SS}, {X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256mb, TB_BCAST_SS}, {X86::VFNMADD231PSZr, X86::VFNMADD231PSZmb, TB_BCAST_SS}, + {X86::VFNMSUB132NEPBF16Z128r, X86::VFNMSUB132NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Z256r, X86::VFNMSUB132NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Zr, X86::VFNMSUB132NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128mb, TB_BCAST_SD}, {X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256mb, TB_BCAST_SD}, {X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZmb, TB_BCAST_SD}, @@ -8608,6 +8967,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128mb, TB_BCAST_SS}, {X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256mb, TB_BCAST_SS}, {X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZmb, TB_BCAST_SS}, + {X86::VFNMSUB213NEPBF16Z128r, X86::VFNMSUB213NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Z256r, X86::VFNMSUB213NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Zr, X86::VFNMSUB213NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128mb, TB_BCAST_SD}, {X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256mb, TB_BCAST_SD}, {X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZmb, TB_BCAST_SD}, @@ -8617,6 +8979,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128mb, TB_BCAST_SS}, {X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256mb, TB_BCAST_SS}, {X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZmb, TB_BCAST_SS}, + {X86::VFNMSUB231NEPBF16Z128r, 
X86::VFNMSUB231NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Z256r, X86::VFNMSUB231NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Zr, X86::VFNMSUB231NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128mb, TB_BCAST_SD}, {X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256mb, TB_BCAST_SD}, {X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZmb, TB_BCAST_SD}, @@ -8626,6 +8991,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS}, {X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS}, {X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS}, + {X86::VGETEXPPBF16Z128rk, X86::VGETEXPPBF16Z128mbk, TB_BCAST_SH}, + {X86::VGETEXPPBF16Z256rk, X86::VGETEXPPBF16Z256mbk, TB_BCAST_SH}, + {X86::VGETEXPPBF16Zrk, X86::VGETEXPPBF16Zmbk, TB_BCAST_SH}, {X86::VGETEXPPDZ128rk, X86::VGETEXPPDZ128mbk, TB_BCAST_SD}, {X86::VGETEXPPDZ256rk, X86::VGETEXPPDZ256mbk, TB_BCAST_SD}, {X86::VGETEXPPDZrk, X86::VGETEXPPDZmbk, TB_BCAST_SD}, @@ -8635,6 +9003,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VGETEXPPSZ128rk, X86::VGETEXPPSZ128mbk, TB_BCAST_SS}, {X86::VGETEXPPSZ256rk, X86::VGETEXPPSZ256mbk, TB_BCAST_SS}, {X86::VGETEXPPSZrk, X86::VGETEXPPSZmbk, TB_BCAST_SS}, + {X86::VGETMANTPBF16Z128rrik, X86::VGETMANTPBF16Z128rmbik, TB_BCAST_SH}, + {X86::VGETMANTPBF16Z256rrik, X86::VGETMANTPBF16Z256rmbik, TB_BCAST_SH}, + {X86::VGETMANTPBF16Zrrik, X86::VGETMANTPBF16Zrmbik, TB_BCAST_SH}, {X86::VGETMANTPDZ128rrik, X86::VGETMANTPDZ128rmbik, TB_BCAST_SD}, {X86::VGETMANTPDZ256rrik, X86::VGETMANTPDZ256rmbik, TB_BCAST_SD}, {X86::VGETMANTPDZrrik, X86::VGETMANTPDZrmbik, TB_BCAST_SD}, @@ -8659,6 +9030,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmbkz, TB_BCAST_SS}, {X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmbkz, TB_BCAST_SS}, {X86::VMAXCPSZrrkz, X86::VMAXCPSZrmbkz, TB_BCAST_SS}, + {X86::VMAXPBF16Z128rrkz, X86::VMAXPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VMAXPBF16Z256rrkz, X86::VMAXPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VMAXPBF16Zrrkz, X86::VMAXPBF16Zrmbkz, TB_BCAST_SH}, {X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmbkz, TB_BCAST_SD}, {X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmbkz, TB_BCAST_SD}, {X86::VMAXPDZrrkz, X86::VMAXPDZrmbkz, TB_BCAST_SD}, @@ -8689,6 +9063,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VMINMAXPSZ128rrikz, X86::VMINMAXPSZ128rmbikz, TB_BCAST_SS}, {X86::VMINMAXPSZ256rrikz, X86::VMINMAXPSZ256rmbikz, TB_BCAST_SS}, {X86::VMINMAXPSZrrikz, X86::VMINMAXPSZrmbikz, TB_BCAST_SS}, + {X86::VMINPBF16Z128rrkz, X86::VMINPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VMINPBF16Z256rrkz, X86::VMINPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VMINPBF16Zrrkz, X86::VMINPBF16Zrmbkz, TB_BCAST_SH}, {X86::VMINPDZ128rrkz, X86::VMINPDZ128rmbkz, TB_BCAST_SD}, {X86::VMINPDZ256rrkz, X86::VMINPDZ256rmbkz, TB_BCAST_SD}, {X86::VMINPDZrrkz, X86::VMINPDZrmbkz, TB_BCAST_SD}, @@ -8698,6 +9075,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VMINPSZ128rrkz, X86::VMINPSZ128rmbkz, TB_BCAST_SS}, {X86::VMINPSZ256rrkz, X86::VMINPSZ256rmbkz, TB_BCAST_SS}, {X86::VMINPSZrrkz, X86::VMINPSZrmbkz, TB_BCAST_SS}, + {X86::VMULNEPBF16Z128rrkz, X86::VMULNEPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VMULNEPBF16Z256rrkz, X86::VMULNEPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VMULNEPBF16Zrrkz, X86::VMULNEPBF16Zrmbkz, TB_BCAST_SH}, {X86::VMULPDZ128rrkz, X86::VMULPDZ128rmbkz, TB_BCAST_SD}, {X86::VMULPDZ256rrkz, X86::VMULPDZ256rmbkz, TB_BCAST_SD}, {X86::VMULPDZrrkz, X86::VMULPDZrmbkz, TB_BCAST_SD}, @@ -9081,9 +9461,15 @@ static const 
X86FoldTableEntry BroadcastTable3[] = { {X86::VRCP14PSZrk, X86::VRCP14PSZmbk, TB_BCAST_SS}, {X86::VRCP28PDZrk, X86::VRCP28PDZmbk, TB_BCAST_SD}, {X86::VRCP28PSZrk, X86::VRCP28PSZmbk, TB_BCAST_SS}, + {X86::VRCPPBF16Z128rk, X86::VRCPPBF16Z128mbk, TB_BCAST_SH}, + {X86::VRCPPBF16Z256rk, X86::VRCPPBF16Z256mbk, TB_BCAST_SH}, + {X86::VRCPPBF16Zrk, X86::VRCPPBF16Zmbk, TB_BCAST_SH}, {X86::VRCPPHZ128rk, X86::VRCPPHZ128mbk, TB_BCAST_SH}, {X86::VRCPPHZ256rk, X86::VRCPPHZ256mbk, TB_BCAST_SH}, {X86::VRCPPHZrk, X86::VRCPPHZmbk, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z128rrik, X86::VREDUCENEPBF16Z128rmbik, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z256rrik, X86::VREDUCENEPBF16Z256rmbik, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Zrrik, X86::VREDUCENEPBF16Zrmbik, TB_BCAST_SH}, {X86::VREDUCEPDZ128rrik, X86::VREDUCEPDZ128rmbik, TB_BCAST_SD}, {X86::VREDUCEPDZ256rrik, X86::VREDUCEPDZ256rmbik, TB_BCAST_SD}, {X86::VREDUCEPDZrrik, X86::VREDUCEPDZrmbik, TB_BCAST_SD}, @@ -9093,6 +9479,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VREDUCEPSZ128rrik, X86::VREDUCEPSZ128rmbik, TB_BCAST_SS}, {X86::VREDUCEPSZ256rrik, X86::VREDUCEPSZ256rmbik, TB_BCAST_SS}, {X86::VREDUCEPSZrrik, X86::VREDUCEPSZrmbik, TB_BCAST_SS}, + {X86::VRNDSCALENEPBF16Z128rrik, X86::VRNDSCALENEPBF16Z128rmbik, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Z256rrik, X86::VRNDSCALENEPBF16Z256rmbik, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Zrrik, X86::VRNDSCALENEPBF16Zrmbik, TB_BCAST_SH}, {X86::VRNDSCALEPDZ128rrik, X86::VRNDSCALEPDZ128rmbik, TB_BCAST_SD}, {X86::VRNDSCALEPDZ256rrik, X86::VRNDSCALEPDZ256rmbik, TB_BCAST_SD}, {X86::VRNDSCALEPDZrrik, X86::VRNDSCALEPDZrmbik, TB_BCAST_SD}, @@ -9110,9 +9499,15 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VRSQRT14PSZrk, X86::VRSQRT14PSZmbk, TB_BCAST_SS}, {X86::VRSQRT28PDZrk, X86::VRSQRT28PDZmbk, TB_BCAST_SD}, {X86::VRSQRT28PSZrk, X86::VRSQRT28PSZmbk, TB_BCAST_SS}, + {X86::VRSQRTPBF16Z128rk, X86::VRSQRTPBF16Z128mbk, TB_BCAST_SH}, + {X86::VRSQRTPBF16Z256rk, X86::VRSQRTPBF16Z256mbk, TB_BCAST_SH}, + {X86::VRSQRTPBF16Zrk, X86::VRSQRTPBF16Zmbk, TB_BCAST_SH}, {X86::VRSQRTPHZ128rk, X86::VRSQRTPHZ128mbk, TB_BCAST_SH}, {X86::VRSQRTPHZ256rk, X86::VRSQRTPHZ256mbk, TB_BCAST_SH}, {X86::VRSQRTPHZrk, X86::VRSQRTPHZmbk, TB_BCAST_SH}, + {X86::VSCALEFPBF16Z128rrkz, X86::VSCALEFPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VSCALEFPBF16Z256rrkz, X86::VSCALEFPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VSCALEFPBF16Zrrkz, X86::VSCALEFPBF16Zrmbkz, TB_BCAST_SH}, {X86::VSCALEFPDZ128rrkz, X86::VSCALEFPDZ128rmbkz, TB_BCAST_SD}, {X86::VSCALEFPDZ256rrkz, X86::VSCALEFPDZ256rmbkz, TB_BCAST_SD}, {X86::VSCALEFPDZrrkz, X86::VSCALEFPDZrmbkz, TB_BCAST_SD}, @@ -9136,6 +9531,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmbikz, TB_BCAST_SS}, {X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmbikz, TB_BCAST_SS}, {X86::VSHUFPSZrrikz, X86::VSHUFPSZrmbikz, TB_BCAST_SS}, + {X86::VSQRTNEPBF16Z128rk, X86::VSQRTNEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Z256rk, X86::VSQRTNEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Zrk, X86::VSQRTNEPBF16Zmbk, TB_BCAST_SH}, {X86::VSQRTPDZ128rk, X86::VSQRTPDZ128mbk, TB_BCAST_SD}, {X86::VSQRTPDZ256rk, X86::VSQRTPDZ256mbk, TB_BCAST_SD}, {X86::VSQRTPDZrk, X86::VSQRTPDZmbk, TB_BCAST_SD}, @@ -9145,6 +9543,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mbk, TB_BCAST_SS}, {X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mbk, TB_BCAST_SS}, {X86::VSQRTPSZrk, X86::VSQRTPSZmbk, TB_BCAST_SS}, + {X86::VSUBNEPBF16Z128rrkz, X86::VSUBNEPBF16Z128rmbkz, TB_BCAST_SH}, 
+ {X86::VSUBNEPBF16Z256rrkz, X86::VSUBNEPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VSUBNEPBF16Zrrkz, X86::VSUBNEPBF16Zrmbkz, TB_BCAST_SH}, {X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmbkz, TB_BCAST_SD}, {X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmbkz, TB_BCAST_SD}, {X86::VSUBPDZrrkz, X86::VSUBPDZrmbkz, TB_BCAST_SD}, @@ -9175,6 +9576,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { }; static const X86FoldTableEntry BroadcastTable4[] = { + {X86::VADDNEPBF16Z128rrk, X86::VADDNEPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VADDNEPBF16Z256rrk, X86::VADDNEPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VADDNEPBF16Zrrk, X86::VADDNEPBF16Zrmbk, TB_BCAST_SH}, {X86::VADDPDZ128rrk, X86::VADDPDZ128rmbk, TB_BCAST_SD}, {X86::VADDPDZ256rrk, X86::VADDPDZ256rmbk, TB_BCAST_SD}, {X86::VADDPDZrrk, X86::VADDPDZrmbk, TB_BCAST_SD}, @@ -9232,6 +9636,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmbk, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmbk, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmbk, TB_BCAST_SS}, + {X86::VDIVNEPBF16Z128rrk, X86::VDIVNEPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VDIVNEPBF16Z256rrk, X86::VDIVNEPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VDIVNEPBF16Zrrk, X86::VDIVNEPBF16Zrmbk, TB_BCAST_SH}, {X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmbk, TB_BCAST_SD}, {X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmbk, TB_BCAST_SD}, {X86::VDIVPDZrrk, X86::VDIVPDZrmbk, TB_BCAST_SD}, @@ -9274,6 +9681,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFIXUPIMMPSZ256rrikz, X86::VFIXUPIMMPSZ256rmbikz, TB_BCAST_SS}, {X86::VFIXUPIMMPSZrrik, X86::VFIXUPIMMPSZrmbik, TB_BCAST_SS}, {X86::VFIXUPIMMPSZrrikz, X86::VFIXUPIMMPSZrmbikz, TB_BCAST_SS}, + {X86::VFMADD132NEPBF16Z128rk, X86::VFMADD132NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Z128rkz, X86::VFMADD132NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Z256rk, X86::VFMADD132NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Z256rkz, X86::VFMADD132NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Zrk, X86::VFMADD132NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Zrkz, X86::VFMADD132NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMADD132PDZ128rk, X86::VFMADD132PDZ128mbk, TB_BCAST_SD}, {X86::VFMADD132PDZ128rkz, X86::VFMADD132PDZ128mbkz, TB_BCAST_SD}, {X86::VFMADD132PDZ256rk, X86::VFMADD132PDZ256mbk, TB_BCAST_SD}, @@ -9292,6 +9705,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMADD132PSZ256rkz, X86::VFMADD132PSZ256mbkz, TB_BCAST_SS}, {X86::VFMADD132PSZrk, X86::VFMADD132PSZmbk, TB_BCAST_SS}, {X86::VFMADD132PSZrkz, X86::VFMADD132PSZmbkz, TB_BCAST_SS}, + {X86::VFMADD213NEPBF16Z128rk, X86::VFMADD213NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Z128rkz, X86::VFMADD213NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Z256rk, X86::VFMADD213NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Z256rkz, X86::VFMADD213NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Zrk, X86::VFMADD213NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Zrkz, X86::VFMADD213NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMADD213PDZ128rk, X86::VFMADD213PDZ128mbk, TB_BCAST_SD}, {X86::VFMADD213PDZ128rkz, X86::VFMADD213PDZ128mbkz, TB_BCAST_SD}, {X86::VFMADD213PDZ256rk, X86::VFMADD213PDZ256mbk, TB_BCAST_SD}, @@ -9310,6 +9729,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMADD213PSZ256rkz, X86::VFMADD213PSZ256mbkz, TB_BCAST_SS}, {X86::VFMADD213PSZrk, X86::VFMADD213PSZmbk, TB_BCAST_SS}, {X86::VFMADD213PSZrkz, X86::VFMADD213PSZmbkz, TB_BCAST_SS}, + {X86::VFMADD231NEPBF16Z128rk, 
X86::VFMADD231NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Z128rkz, X86::VFMADD231NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Z256rk, X86::VFMADD231NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Z256rkz, X86::VFMADD231NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Zrk, X86::VFMADD231NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Zrkz, X86::VFMADD231NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMADD231PDZ128rk, X86::VFMADD231PDZ128mbk, TB_BCAST_SD}, {X86::VFMADD231PDZ128rkz, X86::VFMADD231PDZ128mbkz, TB_BCAST_SD}, {X86::VFMADD231PDZ256rk, X86::VFMADD231PDZ256mbk, TB_BCAST_SD}, @@ -9388,6 +9813,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMADDSUB231PSZ256rkz, X86::VFMADDSUB231PSZ256mbkz, TB_BCAST_SS}, {X86::VFMADDSUB231PSZrk, X86::VFMADDSUB231PSZmbk, TB_BCAST_SS}, {X86::VFMADDSUB231PSZrkz, X86::VFMADDSUB231PSZmbkz, TB_BCAST_SS}, + {X86::VFMSUB132NEPBF16Z128rk, X86::VFMSUB132NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Z128rkz, X86::VFMSUB132NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Z256rk, X86::VFMSUB132NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Z256rkz, X86::VFMSUB132NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Zrk, X86::VFMSUB132NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Zrkz, X86::VFMSUB132NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMSUB132PDZ128rk, X86::VFMSUB132PDZ128mbk, TB_BCAST_SD}, {X86::VFMSUB132PDZ128rkz, X86::VFMSUB132PDZ128mbkz, TB_BCAST_SD}, {X86::VFMSUB132PDZ256rk, X86::VFMSUB132PDZ256mbk, TB_BCAST_SD}, @@ -9406,6 +9837,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMSUB132PSZ256rkz, X86::VFMSUB132PSZ256mbkz, TB_BCAST_SS}, {X86::VFMSUB132PSZrk, X86::VFMSUB132PSZmbk, TB_BCAST_SS}, {X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmbkz, TB_BCAST_SS}, + {X86::VFMSUB213NEPBF16Z128rk, X86::VFMSUB213NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Z128rkz, X86::VFMSUB213NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Z256rk, X86::VFMSUB213NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Z256rkz, X86::VFMSUB213NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Zrk, X86::VFMSUB213NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Zrkz, X86::VFMSUB213NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMSUB213PDZ128rk, X86::VFMSUB213PDZ128mbk, TB_BCAST_SD}, {X86::VFMSUB213PDZ128rkz, X86::VFMSUB213PDZ128mbkz, TB_BCAST_SD}, {X86::VFMSUB213PDZ256rk, X86::VFMSUB213PDZ256mbk, TB_BCAST_SD}, @@ -9424,6 +9861,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMSUB213PSZ256rkz, X86::VFMSUB213PSZ256mbkz, TB_BCAST_SS}, {X86::VFMSUB213PSZrk, X86::VFMSUB213PSZmbk, TB_BCAST_SS}, {X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmbkz, TB_BCAST_SS}, + {X86::VFMSUB231NEPBF16Z128rk, X86::VFMSUB231NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Z128rkz, X86::VFMSUB231NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Z256rk, X86::VFMSUB231NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Z256rkz, X86::VFMSUB231NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Zrk, X86::VFMSUB231NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Zrkz, X86::VFMSUB231NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMSUB231PDZ128rk, X86::VFMSUB231PDZ128mbk, TB_BCAST_SD}, {X86::VFMSUB231PDZ128rkz, X86::VFMSUB231PDZ128mbkz, TB_BCAST_SD}, {X86::VFMSUB231PDZ256rk, X86::VFMSUB231PDZ256mbk, TB_BCAST_SD}, @@ -9499,6 +9942,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMULCPHZ128rrk, X86::VFMULCPHZ128rmbk, TB_BCAST_SS}, {X86::VFMULCPHZ256rrk, X86::VFMULCPHZ256rmbk, TB_BCAST_SS}, 
{X86::VFMULCPHZrrk, X86::VFMULCPHZrmbk, TB_BCAST_SS}, + {X86::VFNMADD132NEPBF16Z128rk, X86::VFNMADD132NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Z128rkz, X86::VFNMADD132NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Z256rk, X86::VFNMADD132NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Z256rkz, X86::VFNMADD132NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Zrk, X86::VFNMADD132NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Zrkz, X86::VFNMADD132NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMADD132PDZ128rk, X86::VFNMADD132PDZ128mbk, TB_BCAST_SD}, {X86::VFNMADD132PDZ128rkz, X86::VFNMADD132PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMADD132PDZ256rk, X86::VFNMADD132PDZ256mbk, TB_BCAST_SD}, @@ -9517,6 +9966,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFNMADD132PSZ256rkz, X86::VFNMADD132PSZ256mbkz, TB_BCAST_SS}, {X86::VFNMADD132PSZrk, X86::VFNMADD132PSZmbk, TB_BCAST_SS}, {X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmbkz, TB_BCAST_SS}, + {X86::VFNMADD213NEPBF16Z128rk, X86::VFNMADD213NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Z128rkz, X86::VFNMADD213NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Z256rk, X86::VFNMADD213NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Z256rkz, X86::VFNMADD213NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Zrk, X86::VFNMADD213NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Zrkz, X86::VFNMADD213NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMADD213PDZ128rk, X86::VFNMADD213PDZ128mbk, TB_BCAST_SD}, {X86::VFNMADD213PDZ128rkz, X86::VFNMADD213PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMADD213PDZ256rk, X86::VFNMADD213PDZ256mbk, TB_BCAST_SD}, @@ -9535,6 +9990,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFNMADD213PSZ256rkz, X86::VFNMADD213PSZ256mbkz, TB_BCAST_SS}, {X86::VFNMADD213PSZrk, X86::VFNMADD213PSZmbk, TB_BCAST_SS}, {X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmbkz, TB_BCAST_SS}, + {X86::VFNMADD231NEPBF16Z128rk, X86::VFNMADD231NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Z128rkz, X86::VFNMADD231NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Z256rk, X86::VFNMADD231NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Z256rkz, X86::VFNMADD231NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Zrk, X86::VFNMADD231NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Zrkz, X86::VFNMADD231NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMADD231PDZ128rk, X86::VFNMADD231PDZ128mbk, TB_BCAST_SD}, {X86::VFNMADD231PDZ128rkz, X86::VFNMADD231PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMADD231PDZ256rk, X86::VFNMADD231PDZ256mbk, TB_BCAST_SD}, @@ -9553,6 +10014,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFNMADD231PSZ256rkz, X86::VFNMADD231PSZ256mbkz, TB_BCAST_SS}, {X86::VFNMADD231PSZrk, X86::VFNMADD231PSZmbk, TB_BCAST_SS}, {X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmbkz, TB_BCAST_SS}, + {X86::VFNMSUB132NEPBF16Z128rk, X86::VFNMSUB132NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Z128rkz, X86::VFNMSUB132NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Z256rk, X86::VFNMSUB132NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Z256rkz, X86::VFNMSUB132NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Zrk, X86::VFNMSUB132NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Zrkz, X86::VFNMSUB132NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMSUB132PDZ128rk, X86::VFNMSUB132PDZ128mbk, TB_BCAST_SD}, {X86::VFNMSUB132PDZ128rkz, X86::VFNMSUB132PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMSUB132PDZ256rk, X86::VFNMSUB132PDZ256mbk, TB_BCAST_SD}, @@ -9571,6 +10038,12 @@ static const 
X86FoldTableEntry BroadcastTable4[] = { {X86::VFNMSUB132PSZ256rkz, X86::VFNMSUB132PSZ256mbkz, TB_BCAST_SS}, {X86::VFNMSUB132PSZrk, X86::VFNMSUB132PSZmbk, TB_BCAST_SS}, {X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmbkz, TB_BCAST_SS}, + {X86::VFNMSUB213NEPBF16Z128rk, X86::VFNMSUB213NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Z128rkz, X86::VFNMSUB213NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Z256rk, X86::VFNMSUB213NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Z256rkz, X86::VFNMSUB213NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Zrk, X86::VFNMSUB213NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Zrkz, X86::VFNMSUB213NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMSUB213PDZ128rk, X86::VFNMSUB213PDZ128mbk, TB_BCAST_SD}, {X86::VFNMSUB213PDZ128rkz, X86::VFNMSUB213PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMSUB213PDZ256rk, X86::VFNMSUB213PDZ256mbk, TB_BCAST_SD}, @@ -9589,6 +10062,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFNMSUB213PSZ256rkz, X86::VFNMSUB213PSZ256mbkz, TB_BCAST_SS}, {X86::VFNMSUB213PSZrk, X86::VFNMSUB213PSZmbk, TB_BCAST_SS}, {X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmbkz, TB_BCAST_SS}, + {X86::VFNMSUB231NEPBF16Z128rk, X86::VFNMSUB231NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Z128rkz, X86::VFNMSUB231NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Z256rk, X86::VFNMSUB231NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Z256rkz, X86::VFNMSUB231NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Zrk, X86::VFNMSUB231NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Zrkz, X86::VFNMSUB231NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMSUB231PDZ128rk, X86::VFNMSUB231PDZ128mbk, TB_BCAST_SD}, {X86::VFNMSUB231PDZ128rkz, X86::VFNMSUB231PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMSUB231PDZ256rk, X86::VFNMSUB231PDZ256mbk, TB_BCAST_SD}, @@ -9622,6 +10101,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmbk, TB_BCAST_SS}, {X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmbk, TB_BCAST_SS}, {X86::VMAXCPSZrrk, X86::VMAXCPSZrmbk, TB_BCAST_SS}, + {X86::VMAXPBF16Z128rrk, X86::VMAXPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VMAXPBF16Z256rrk, X86::VMAXPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VMAXPBF16Zrrk, X86::VMAXPBF16Zrmbk, TB_BCAST_SH}, {X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmbk, TB_BCAST_SD}, {X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmbk, TB_BCAST_SD}, {X86::VMAXPDZrrk, X86::VMAXPDZrmbk, TB_BCAST_SD}, @@ -9652,6 +10134,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VMINMAXPSZ128rrik, X86::VMINMAXPSZ128rmbik, TB_BCAST_SS}, {X86::VMINMAXPSZ256rrik, X86::VMINMAXPSZ256rmbik, TB_BCAST_SS}, {X86::VMINMAXPSZrrik, X86::VMINMAXPSZrmbik, TB_BCAST_SS}, + {X86::VMINPBF16Z128rrk, X86::VMINPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VMINPBF16Z256rrk, X86::VMINPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VMINPBF16Zrrk, X86::VMINPBF16Zrmbk, TB_BCAST_SH}, {X86::VMINPDZ128rrk, X86::VMINPDZ128rmbk, TB_BCAST_SD}, {X86::VMINPDZ256rrk, X86::VMINPDZ256rmbk, TB_BCAST_SD}, {X86::VMINPDZrrk, X86::VMINPDZrmbk, TB_BCAST_SD}, @@ -9661,6 +10146,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VMINPSZ128rrk, X86::VMINPSZ128rmbk, TB_BCAST_SS}, {X86::VMINPSZ256rrk, X86::VMINPSZ256rmbk, TB_BCAST_SS}, {X86::VMINPSZrrk, X86::VMINPSZrmbk, TB_BCAST_SS}, + {X86::VMULNEPBF16Z128rrk, X86::VMULNEPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VMULNEPBF16Z256rrk, X86::VMULNEPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VMULNEPBF16Zrrk, X86::VMULNEPBF16Zrmbk, TB_BCAST_SH}, {X86::VMULPDZ128rrk, X86::VMULPDZ128rmbk, TB_BCAST_SD}, {X86::VMULPDZ256rrk, 
X86::VMULPDZ256rmbk, TB_BCAST_SD}, {X86::VMULPDZrrk, X86::VMULPDZrmbk, TB_BCAST_SD}, @@ -10023,6 +10511,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VRANGEPSZ128rrik, X86::VRANGEPSZ128rmbik, TB_BCAST_SS}, {X86::VRANGEPSZ256rrik, X86::VRANGEPSZ256rmbik, TB_BCAST_SS}, {X86::VRANGEPSZrrik, X86::VRANGEPSZrmbik, TB_BCAST_SS}, + {X86::VSCALEFPBF16Z128rrk, X86::VSCALEFPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VSCALEFPBF16Z256rrk, X86::VSCALEFPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VSCALEFPBF16Zrrk, X86::VSCALEFPBF16Zrmbk, TB_BCAST_SH}, {X86::VSCALEFPDZ128rrk, X86::VSCALEFPDZ128rmbk, TB_BCAST_SD}, {X86::VSCALEFPDZ256rrk, X86::VSCALEFPDZ256rmbk, TB_BCAST_SD}, {X86::VSCALEFPDZrrk, X86::VSCALEFPDZrmbk, TB_BCAST_SD}, @@ -10046,6 +10537,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmbik, TB_BCAST_SS}, {X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmbik, TB_BCAST_SS}, {X86::VSHUFPSZrrik, X86::VSHUFPSZrmbik, TB_BCAST_SS}, + {X86::VSUBNEPBF16Z128rrk, X86::VSUBNEPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VSUBNEPBF16Z256rrk, X86::VSUBNEPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VSUBNEPBF16Zrrk, X86::VSUBNEPBF16Zrmbk, TB_BCAST_SH}, {X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmbk, TB_BCAST_SD}, {X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmbk, TB_BCAST_SD}, {X86::VSUBPDZrrk, X86::VSUBPDZrmbk, TB_BCAST_SD}, From 814aa432abf8e9f644903061029e6e27f6a418a8 Mon Sep 17 00:00:00 2001 From: vporpo Date: Tue, 3 Sep 2024 17:15:24 -0700 Subject: [PATCH 005/425] [SandboxIR] Implement ConstantAggregate (#107136) This patch implements sandboxir:: ConstantAggregate, ConstantStruct, ConstantArray and ConstantVector, mirroring LLVM IR. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 94 +++++++++++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 3 + llvm/include/llvm/SandboxIR/Type.h | 45 +++++++-- llvm/lib/SandboxIR/SandboxIR.cpp | 48 +++++++++- llvm/lib/SandboxIR/Type.cpp | 15 +++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 75 +++++++++++++++ llvm/unittests/SandboxIR/TypesTest.cpp | 38 ++++++++ 7 files changed, 310 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 0ac049af4db2b..5c2d58c1b99dc 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -304,6 +304,8 @@ class Value { friend class PHINode; // For getting `Val`. friend class UnreachableInst; // For getting `Val`. friend class CatchSwitchAddHandler; // For `Val`. + friend class ConstantArray; // For `Val`. + friend class ConstantStruct; // For `Val`. /// All values point to the context. Context &Ctx; @@ -840,6 +842,97 @@ class ConstantFP final : public Constant { #endif }; +/// Base class for aggregate constants (with operands). +class ConstantAggregate : public Constant { +protected: + ConstantAggregate(ClassID ID, llvm::Constant *C, Context &Ctx) + : Constant(ID, C, Ctx) {} + +public: + /// For isa/dyn_cast. + static bool classof(const sandboxir::Value *From) { + auto ID = From->getSubclassID(); + return ID == ClassID::ConstantVector || ID == ClassID::ConstantStruct || + ID == ClassID::ConstantArray; + } +}; + +class ConstantArray final : public ConstantAggregate { + ConstantArray(llvm::ConstantArray *C, Context &Ctx) + : ConstantAggregate(ClassID::ConstantArray, C, Ctx) {} + friend class Context; // For constructor. + +public: + static Constant *get(ArrayType *T, ArrayRef V); + ArrayType *getType() const; + + // TODO: Missing functions: getType(), getTypeForElements(), getAnon(), get(). + + /// For isa/dyn_cast. 
+ static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ConstantArray; + } +}; + +class ConstantStruct final : public ConstantAggregate { + ConstantStruct(llvm::ConstantStruct *C, Context &Ctx) + : ConstantAggregate(ClassID::ConstantStruct, C, Ctx) {} + friend class Context; // For constructor. + +public: + static Constant *get(StructType *T, ArrayRef V); + + template + static std::enable_if_t::value, Constant *> + get(StructType *T, Csts *...Vs) { + return get(T, ArrayRef({Vs...})); + } + /// Return an anonymous struct that has the specified elements. + /// If the struct is possibly empty, then you must specify a context. + static Constant *getAnon(ArrayRef V, bool Packed = false) { + return get(getTypeForElements(V, Packed), V); + } + static Constant *getAnon(Context &Ctx, ArrayRef V, + bool Packed = false) { + return get(getTypeForElements(Ctx, V, Packed), V); + } + /// This version of the method allows an empty list. + static StructType *getTypeForElements(Context &Ctx, ArrayRef V, + bool Packed = false); + /// Return an anonymous struct type to use for a constant with the specified + /// set of elements. The list must not be empty. + static StructType *getTypeForElements(ArrayRef V, + bool Packed = false) { + assert(!V.empty() && + "ConstantStruct::getTypeForElements cannot be called on empty list"); + return getTypeForElements(V[0]->getContext(), V, Packed); + } + + /// Specialization - reduce amount of casting. + inline StructType *getType() const { + return cast(Value::getType()); + } + + /// For isa/dyn_cast. + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ConstantStruct; + } +}; + +class ConstantVector final : public ConstantAggregate { + ConstantVector(llvm::ConstantVector *C, Context &Ctx) + : ConstantAggregate(ClassID::ConstantVector, C, Ctx) {} + friend class Context; // For constructor. + +public: + // TODO: Missing functions: getSplat(), getType(), getSplatValue(), get(). + + /// For isa/dyn_cast. + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ConstantVector; + } +}; + /// Iterator for `Instruction`s in a `BasicBlock. /// \Returns an sandboxir::Instruction & when derereferenced. class BBIterator { @@ -3353,6 +3446,7 @@ class Context { friend class Type; // For LLVMCtx. friend class PointerType; // For LLVMCtx. friend class IntegerType; // For LLVMCtx. + friend class StructType; // For LLVMCtx. Tracker IRTracker; /// Maps LLVM Value to the corresponding sandboxir::Value. 
Owns all diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 2fc24ed71c4cf..d2031bbdcfb54 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -27,6 +27,9 @@ DEF_VALUE(Block, BasicBlock) DEF_CONST(Constant, Constant) DEF_CONST(ConstantInt, ConstantInt) DEF_CONST(ConstantFP, ConstantFP) +DEF_CONST(ConstantArray, ConstantArray) +DEF_CONST(ConstantStruct, ConstantStruct) +DEF_CONST(ConstantVector, ConstantVector) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 39c545a6e6c6d..2f9b94b8d7175 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -27,6 +27,8 @@ class PointerType; class VectorType; class IntegerType; class FunctionType; +class ArrayType; +class StructType; #define DEF_INSTR(ID, OPCODE, CLASS) class CLASS; #define DEF_CONST(ID, CLASS) class CLASS; #include "llvm/SandboxIR/SandboxIRValues.def" @@ -36,13 +38,19 @@ class FunctionType; class Type { protected: llvm::Type *LLVMTy; - friend class VectorType; // For LLVMTy. - friend class PointerType; // For LLVMTy. - friend class FunctionType; // For LLVMTy. - friend class IntegerType; // For LLVMTy. - friend class Function; // For LLVMTy. - friend class CallBase; // For LLVMTy. - friend class ConstantInt; // For LLVMTy. + friend class ArrayType; // For LLVMTy. + friend class StructType; // For LLVMTy. + friend class VectorType; // For LLVMTy. + friend class PointerType; // For LLVMTy. + friend class FunctionType; // For LLVMTy. + friend class IntegerType; // For LLVMTy. + friend class Function; // For LLVMTy. + friend class CallBase; // For LLVMTy. + friend class ConstantInt; // For LLVMTy. + friend class ConstantArray; // For LLVMTy. + friend class ConstantStruct; // For LLVMTy. + friend class ConstantVector; // For LLVMTy. + // Friend all instruction classes because `create()` functions use LLVMTy. #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; #define DEF_CONST(ID, CLASS) friend class CLASS; @@ -281,8 +289,31 @@ class PointerType : public Type { } }; +class ArrayType : public Type { +public: + // TODO: add missing functions + static bool classof(const Type *From) { + return isa(From->LLVMTy); + } +}; + +class StructType : public Type { +public: + /// This static method is the primary way to create a literal StructType. 
+ static StructType *get(Context &Ctx, ArrayRef Elements, + bool IsPacked = false); + + bool isPacked() const { return cast(LLVMTy)->isPacked(); } + + // TODO: add missing functions + static bool classof(const Type *From) { + return isa(From->LLVMTy); + } +}; + class VectorType : public Type { public: + static VectorType *get(Type *ElementType, ElementCount EC); // TODO: add missing functions static bool classof(const Type *From) { return isa(From->LLVMTy); diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 5af6fbdde42cb..e8d081e6b17e7 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2364,6 +2364,44 @@ bool ConstantFP::isValueValidForType(Type *Ty, const APFloat &V) { return llvm::ConstantFP::isValueValidForType(Ty->LLVMTy, V); } +Constant *ConstantArray::get(ArrayType *T, ArrayRef V) { + auto &Ctx = T->getContext(); + SmallVector LLVMValues; + LLVMValues.reserve(V.size()); + for (auto *Elm : V) + LLVMValues.push_back(cast(Elm->Val)); + auto *LLVMC = + llvm::ConstantArray::get(cast(T->LLVMTy), LLVMValues); + return cast(Ctx.getOrCreateConstant(LLVMC)); +} + +ArrayType *ConstantArray::getType() const { + return cast( + Ctx.getType(cast(Val)->getType())); +} + +Constant *ConstantStruct::get(StructType *T, ArrayRef V) { + auto &Ctx = T->getContext(); + SmallVector LLVMValues; + LLVMValues.reserve(V.size()); + for (auto *Elm : V) + LLVMValues.push_back(cast(Elm->Val)); + auto *LLVMC = + llvm::ConstantStruct::get(cast(T->LLVMTy), LLVMValues); + return cast(Ctx.getOrCreateConstant(LLVMC)); +} + +StructType *ConstantStruct::getTypeForElements(Context &Ctx, + ArrayRef V, + bool Packed) { + unsigned VecSize = V.size(); + SmallVector EltTypes; + EltTypes.reserve(VecSize); + for (Constant *Elm : V) + EltTypes.push_back(Elm->getType()); + return StructType::get(Ctx, EltTypes, Packed); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2459,7 +2497,15 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr(new ConstantFP(CF, *this)); return It->second.get(); } - if (auto *F = dyn_cast(LLVMV)) + if (auto *CA = dyn_cast(C)) + It->second = std::unique_ptr(new ConstantArray(CA, *this)); + else if (auto *CS = dyn_cast(C)) + It->second = + std::unique_ptr(new ConstantStruct(CS, *this)); + else if (auto *CV = dyn_cast(C)) + It->second = + std::unique_ptr(new ConstantVector(CV, *this)); + else if (auto *F = dyn_cast(LLVMV)) It->second = std::unique_ptr(new Function(F, *this)); else It->second = std::unique_ptr(new Constant(C, *this)); diff --git a/llvm/lib/SandboxIR/Type.cpp b/llvm/lib/SandboxIR/Type.cpp index eee69c5ec7c89..535b0f75fd874 100644 --- a/llvm/lib/SandboxIR/Type.cpp +++ b/llvm/lib/SandboxIR/Type.cpp @@ -47,6 +47,21 @@ PointerType *PointerType::get(Context &Ctx, unsigned AddressSpace) { Ctx.getType(llvm::PointerType::get(Ctx.LLVMCtx, AddressSpace))); } +StructType *StructType::get(Context &Ctx, ArrayRef Elements, + bool IsPacked) { + SmallVector LLVMElements; + LLVMElements.reserve(Elements.size()); + for (Type *Elm : Elements) + LLVMElements.push_back(Elm->LLVMTy); + return cast( + Ctx.getType(llvm::StructType::get(Ctx.LLVMCtx, LLVMElements, IsPacked))); +} + +VectorType *VectorType::get(Type *ElementType, ElementCount EC) { + return cast(ElementType->getContext().getType( + llvm::VectorType::get(ElementType->LLVMTy, EC))); +} + IntegerType *IntegerType::get(Context &Ctx, unsigned NumBits) { return cast( 
Ctx.getType(llvm::IntegerType::get(Ctx.LLVMCtx, NumBits))); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 2ec8eefd8c323..ca2a183e53268 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -445,6 +445,81 @@ define void @foo(float %v0, double %v1) { EXPECT_TRUE(NegZero->isExactlyValue(-0.0)); } +// Tests ConstantArray, ConstantStruct and ConstantVector. +TEST_F(SandboxIRTest, ConstantAggregate) { + // Note: we are using i42 to avoid the creation of ConstantDataVector or + // ConstantDataArray. + parseIR(C, R"IR( +define void @foo() { + %array = extractvalue [2 x i42] [i42 0, i42 1], 0 + %struct = extractvalue {i42, i42} {i42 0, i42 1}, 0 + %vector = extractelement <2 x i42> , i32 0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto &BB = *F.begin(); + auto It = BB.begin(); + auto *I0 = &*It++; + auto *I1 = &*It++; + auto *I2 = &*It++; + // Check classof() and creation. + auto *Array = cast(I0->getOperand(0)); + EXPECT_TRUE(isa(Array)); + auto *Struct = cast(I1->getOperand(0)); + EXPECT_TRUE(isa(Struct)); + auto *Vector = cast(I2->getOperand(0)); + EXPECT_TRUE(isa(Vector)); + + auto *ZeroI42 = cast(Array->getOperand(0)); + auto *OneI42 = cast(Array->getOperand(1)); + // Check ConstantArray::get(), getType(). + auto *NewCA = + sandboxir::ConstantArray::get(Array->getType(), {ZeroI42, OneI42}); + EXPECT_EQ(NewCA, Array); + + // Check ConstantStruct::get(), getType(). + auto *NewCS = + sandboxir::ConstantStruct::get(Struct->getType(), {ZeroI42, OneI42}); + EXPECT_EQ(NewCS, Struct); + // Check ConstantStruct::get(...). + auto *NewCS2 = + sandboxir::ConstantStruct::get(Struct->getType(), ZeroI42, OneI42); + EXPECT_EQ(NewCS2, Struct); + // Check ConstantStruct::getAnon(ArayRef). + auto *AnonCS = sandboxir::ConstantStruct::getAnon({ZeroI42, OneI42}); + EXPECT_FALSE(cast(AnonCS->getType())->isPacked()); + auto *AnonCSPacked = + sandboxir::ConstantStruct::getAnon({ZeroI42, OneI42}, /*Packed=*/true); + EXPECT_TRUE(cast(AnonCSPacked->getType())->isPacked()); + // Check ConstantStruct::getAnon(Ctx, ArrayRef). + auto *AnonCS2 = sandboxir::ConstantStruct::getAnon(Ctx, {ZeroI42, OneI42}); + EXPECT_EQ(AnonCS2, AnonCS); + auto *AnonCS2Packed = sandboxir::ConstantStruct::getAnon( + Ctx, {ZeroI42, OneI42}, /*Packed=*/true); + EXPECT_EQ(AnonCS2Packed, AnonCSPacked); + // Check ConstantStruct::getTypeForElements(Ctx, ArrayRef). + auto *StructTy = + sandboxir::ConstantStruct::getTypeForElements(Ctx, {ZeroI42, OneI42}); + EXPECT_EQ(StructTy, Struct->getType()); + EXPECT_FALSE(StructTy->isPacked()); + // Check ConstantStruct::getTypeForElements(Ctx, ArrayRef, Packed). + auto *StructTyPacked = sandboxir::ConstantStruct::getTypeForElements( + Ctx, {ZeroI42, OneI42}, /*Packed=*/true); + EXPECT_TRUE(StructTyPacked->isPacked()); + // Check ConstantStruct::getTypeForElements(ArrayRef). + auto *StructTy2 = + sandboxir::ConstantStruct::getTypeForElements(Ctx, {ZeroI42, OneI42}); + EXPECT_EQ(StructTy2, Struct->getType()); + // Check ConstantStruct::getTypeForElements(ArrayRef, Packed). 
+ auto *StructTy2Packed = sandboxir::ConstantStruct::getTypeForElements( + Ctx, {ZeroI42, OneI42}, /*Packed=*/true); + EXPECT_EQ(StructTy2Packed, StructTyPacked); +} + TEST_F(SandboxIRTest, Use) { parseIR(C, R"IR( define i32 @foo(i32 %v0, i32 %v1) { diff --git a/llvm/unittests/SandboxIR/TypesTest.cpp b/llvm/unittests/SandboxIR/TypesTest.cpp index dcbf65a20b2fd..d4c2de441268c 100644 --- a/llvm/unittests/SandboxIR/TypesTest.cpp +++ b/llvm/unittests/SandboxIR/TypesTest.cpp @@ -224,6 +224,44 @@ define void @foo(ptr %ptr) { EXPECT_EQ(NewPtrTy2, PtrTy); } +TEST_F(SandboxTypeTest, ArrayType) { + parseIR(C, R"IR( +define void @foo([2 x i8] %v0) { + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + // Check classof(), creation. + [[maybe_unused]] auto *ArrayTy = + cast(F->getArg(0)->getType()); +} + +TEST_F(SandboxTypeTest, StructType) { + parseIR(C, R"IR( +define void @foo({i32, i8} %v0) { + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *Int32Ty = sandboxir::Type::getInt32Ty(Ctx); + auto *Int8Ty = sandboxir::Type::getInt8Ty(Ctx); + // Check classof(), creation. + [[maybe_unused]] auto *StructTy = + cast(F->getArg(0)->getType()); + // Check get(). + auto *NewStructTy = sandboxir::StructType::get(Ctx, {Int32Ty, Int8Ty}); + EXPECT_EQ(NewStructTy, StructTy); + // Check get(Packed). + auto *NewStructTyPacked = + sandboxir::StructType::get(Ctx, {Int32Ty, Int8Ty}, /*Packed=*/true); + EXPECT_NE(NewStructTyPacked, StructTy); + EXPECT_TRUE(NewStructTyPacked->isPacked()); +} + TEST_F(SandboxTypeTest, VectorType) { parseIR(C, R"IR( define void @foo(<2 x i8> %v0) { From 48bc8b0f7f49f5b23884a0d9d21056ec0bfffe24 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 4 Sep 2024 00:26:54 +0000 Subject: [PATCH 006/425] [gn build] Port 83ad644afaac --- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index 73cee07a8b9d7..f3553210191a4 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -134,10 +134,12 @@ copy("Headers") { "arm_cmse.h", "arm_neon_sve_bridge.h", "armintr.h", + "avx10_2_512bf16intrin.h", "avx10_2_512convertintrin.h", "avx10_2_512minmaxintrin.h", "avx10_2_512niintrin.h", "avx10_2_512satcvtintrin.h", + "avx10_2bf16intrin.h", "avx10_2convertintrin.h", "avx10_2minmaxintrin.h", "avx10_2niintrin.h", From ff0f2011e475141454028bce9cf7c6ff37a49620 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 3 Sep 2024 17:50:04 -0700 Subject: [PATCH 007/425] [RISCV] Bitcast fixed length bf16/f16 build_vector to i16 with Zvfbfmin/Zvfhmin+Zfbfmin/Zfhmin. (#106637) Previously, if Zfbfmin/Zfhmin were enabled, we only handled build_vectors that could be turned into splat_vectors. We promoted them to f32 splats by extending in the scalar domain and narrowing in the vector domain. This patch fixes a crash where we failed to account for whether the f32 vector type fit in LMUL<=8. Because the new lowering occurs after type legalization, we have to be careful to use XLenVT for the scalar integer type and use custom cast nodes. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 60 +- .../rvv/fixed-vectors-fp-buildvec-bf16.ll | 175 ++ .../RISCV/rvv/fixed-vectors-fp-buildvec.ll | 151 +- .../RISCV/rvv/fixed-vectors-fp-setcc.ll | 160 +- .../RISCV/rvv/fixed-vectors-fp-splat-bf16.ll | 45 +- .../RISCV/rvv/fixed-vectors-fp-splat.ll | 89 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 1722 +++++++++-------- .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 196 +- .../RISCV/rvv/fixed-vectors-vfadd-vp.ll | 80 +- .../RISCV/rvv/fixed-vectors-vfdiv-vp.ll | 80 +- .../RISCV/rvv/fixed-vectors-vfma-vp.ll | 104 +- .../CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll | 80 +- .../CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll | 80 +- .../RISCV/rvv/fixed-vectors-vfmul-vp.ll | 80 +- .../RISCV/rvv/fixed-vectors-vfsub-vp.ll | 80 +- .../RISCV/rvv/fixed-vectors-vpmerge-bf16.ll | 40 +- .../RISCV/rvv/fixed-vectors-vpmerge.ll | 40 +- 17 files changed, 1667 insertions(+), 1595 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 303e0a1eee9cf..11be5c25354f1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1322,6 +1322,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (VT.getVectorElementType() == MVT::f16 && !Subtarget.hasVInstructionsF16()) { + setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, @@ -1331,8 +1332,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); if (Subtarget.hasStdExtZfhmin()) { - // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR. - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); } else { // We need to custom legalize f16 build vectors if Zfhmin isn't // available. @@ -1350,10 +1350,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (VT.getVectorElementType() == MVT::bf16) { + setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); if (Subtarget.hasStdExtZfbfmin()) { - // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR. - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); } else { // We need to custom legalize bf16 build vectors if Zfbfmin isn't // available. @@ -4120,26 +4120,45 @@ static SDValue lowerBuildVectorViaPacking(SDValue Op, SelectionDAG &DAG, DAG.getBuildVector(WideVecVT, DL, NewOperands)); } -// Convert to an vXf16 build_vector to vXi16 with bitcasts. -static SDValue lowerBUILD_VECTORvXf16(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT IVT = VT.changeVectorElementType(MVT::i16); - SmallVector NewOps(Op.getNumOperands()); - for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) - NewOps[I] = DAG.getBitcast(MVT::i16, Op.getOperand(I)); - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), IVT, NewOps); - return DAG.getBitcast(VT, Res); -} - static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); assert(VT.isFixedLengthVector() && "Unexpected vector!"); - // If we don't have scalar f16/bf16, we need to bitcast to an i16 vector. 
- if ((VT.getVectorElementType() == MVT::f16 && !Subtarget.hasStdExtZfhmin()) || - (VT.getVectorElementType() == MVT::bf16 && !Subtarget.hasStdExtZfbfmin())) - return lowerBUILD_VECTORvXf16(Op, DAG); + MVT EltVT = VT.getVectorElementType(); + MVT XLenVT = Subtarget.getXLenVT(); + + SDLoc DL(Op); + + // Proper support for f16 requires Zvfh. bf16 always requires special + // handling. We need to cast the scalar to integer and create an integer + // build_vector. + if ((EltVT == MVT::f16 && !Subtarget.hasStdExtZvfh()) || EltVT == MVT::bf16) { + MVT IVT = VT.changeVectorElementType(MVT::i16); + SmallVector NewOps(Op.getNumOperands()); + for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) { + SDValue Elem = Op.getOperand(I); + if ((EltVT == MVT::bf16 && Subtarget.hasStdExtZfbfmin()) || + (EltVT == MVT::f16 && Subtarget.hasStdExtZfhmin())) { + // Called by LegalizeDAG, we need to use XLenVT operations since we + // can't create illegal types. + if (auto *C = dyn_cast(Elem)) { + // Manually constant fold so the integer build_vector can be lowered + // better. Waiting for DAGCombine will be too late. + APInt V = + C->getValueAPF().bitcastToAPInt().sext(XLenVT.getSizeInBits()); + NewOps[I] = DAG.getConstant(V, DL, XLenVT); + } else { + NewOps[I] = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, XLenVT, Elem); + } + } else { + // Called by scalar type legalizer, we can use i16. + NewOps[I] = DAG.getBitcast(MVT::i16, Op.getOperand(I)); + } + } + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, IVT, NewOps); + return DAG.getBitcast(VT, Res); + } if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) @@ -4147,11 +4166,8 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); - SDLoc DL(Op); auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); - MVT XLenVT = Subtarget.getXLenVT(); - if (VT.getVectorElementType() == MVT::i1) { // A BUILD_VECTOR can be lowered as a SETCC. 
For each fixed-length mask // vector type, we have a legal equivalently-sized i8 type, so we can use diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll new file mode 100644 index 0000000000000..170e71af09b49 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFBFMIN,RV32-NO-ZFBFMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFBFMIN,RV64-NO-ZFBFMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFBFMIN,RV32-ZFBFMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFBFMIN,RV64-ZFBFMIN + +define <4 x bfloat> @splat_idx_v4bf16(<4 x bfloat> %v, i64 %idx) { +; RV32-NO-ZFBFMIN-LABEL: splat_idx_v4bf16: +; RV32-NO-ZFBFMIN: # %bb.0: +; RV32-NO-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NO-ZFBFMIN-NEXT: vrgather.vx v9, v8, a0 +; RV32-NO-ZFBFMIN-NEXT: vmv1r.v v8, v9 +; RV32-NO-ZFBFMIN-NEXT: ret +; +; RV64-NO-ZFBFMIN-LABEL: splat_idx_v4bf16: +; RV64-NO-ZFBFMIN: # %bb.0: +; RV64-NO-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NO-ZFBFMIN-NEXT: vrgather.vx v9, v8, a0 +; RV64-NO-ZFBFMIN-NEXT: vmv1r.v v8, v9 +; RV64-NO-ZFBFMIN-NEXT: ret +; +; RV32-ZFBFMIN-LABEL: splat_idx_v4bf16: +; RV32-ZFBFMIN: # %bb.0: +; RV32-ZFBFMIN-NEXT: addi sp, sp, -48 +; RV32-ZFBFMIN-NEXT: .cfi_def_cfa_offset 48 +; RV32-ZFBFMIN-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-ZFBFMIN-NEXT: .cfi_offset ra, -4 +; RV32-ZFBFMIN-NEXT: csrr a1, vlenb +; RV32-ZFBFMIN-NEXT: slli a1, a1, 1 +; RV32-ZFBFMIN-NEXT: sub sp, sp, a1 +; RV32-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV32-ZFBFMIN-NEXT: addi a1, sp, 32 +; RV32-ZFBFMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; RV32-ZFBFMIN-NEXT: andi a0, a0, 3 +; RV32-ZFBFMIN-NEXT: li a1, 2 +; RV32-ZFBFMIN-NEXT: call __mulsi3 +; RV32-ZFBFMIN-NEXT: addi a1, sp, 16 +; RV32-ZFBFMIN-NEXT: add a0, a1, a0 +; RV32-ZFBFMIN-NEXT: addi a2, sp, 32 +; RV32-ZFBFMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload +; RV32-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-ZFBFMIN-NEXT: vse16.v v8, (a1) +; RV32-ZFBFMIN-NEXT: flh fa5, 0(a0) +; RV32-ZFBFMIN-NEXT: fmv.x.h a0, fa5 +; RV32-ZFBFMIN-NEXT: vmv.v.x v8, a0 +; RV32-ZFBFMIN-NEXT: csrr a0, vlenb +; RV32-ZFBFMIN-NEXT: slli a0, a0, 1 +; RV32-ZFBFMIN-NEXT: add sp, sp, a0 +; RV32-ZFBFMIN-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-ZFBFMIN-NEXT: addi sp, sp, 48 +; RV32-ZFBFMIN-NEXT: ret +; +; RV64-ZFBFMIN-LABEL: splat_idx_v4bf16: +; RV64-ZFBFMIN: # %bb.0: +; RV64-ZFBFMIN-NEXT: addi sp, sp, -48 +; RV64-ZFBFMIN-NEXT: .cfi_def_cfa_offset 48 +; RV64-ZFBFMIN-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; RV64-ZFBFMIN-NEXT: .cfi_offset ra, -8 +; RV64-ZFBFMIN-NEXT: csrr a1, vlenb +; RV64-ZFBFMIN-NEXT: slli a1, a1, 1 +; RV64-ZFBFMIN-NEXT: sub sp, sp, a1 +; RV64-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 
0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV64-ZFBFMIN-NEXT: addi a1, sp, 32 +; RV64-ZFBFMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; RV64-ZFBFMIN-NEXT: andi a0, a0, 3 +; RV64-ZFBFMIN-NEXT: li a1, 2 +; RV64-ZFBFMIN-NEXT: call __muldi3 +; RV64-ZFBFMIN-NEXT: addi a1, sp, 16 +; RV64-ZFBFMIN-NEXT: add a0, a1, a0 +; RV64-ZFBFMIN-NEXT: addi a2, sp, 32 +; RV64-ZFBFMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload +; RV64-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-ZFBFMIN-NEXT: vse16.v v8, (a1) +; RV64-ZFBFMIN-NEXT: flh fa5, 0(a0) +; RV64-ZFBFMIN-NEXT: fmv.x.h a0, fa5 +; RV64-ZFBFMIN-NEXT: vmv.v.x v8, a0 +; RV64-ZFBFMIN-NEXT: csrr a0, vlenb +; RV64-ZFBFMIN-NEXT: slli a0, a0, 1 +; RV64-ZFBFMIN-NEXT: add sp, sp, a0 +; RV64-ZFBFMIN-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; RV64-ZFBFMIN-NEXT: addi sp, sp, 48 +; RV64-ZFBFMIN-NEXT: ret + %x = extractelement <4 x bfloat> %v, i64 %idx + %ins = insertelement <4 x bfloat> poison, bfloat %x, i32 0 + %splat = shufflevector <4 x bfloat> %ins, <4 x bfloat> poison, <4 x i32> zeroinitializer + ret <4 x bfloat> %splat +} + +define <2 x bfloat> @buildvec_v2bf16(bfloat %a, bfloat %b) { +; RV32-NO-ZFBFMIN-LABEL: buildvec_v2bf16: +; RV32-NO-ZFBFMIN: # %bb.0: +; RV32-NO-ZFBFMIN-NEXT: fmv.x.w a0, fa1 +; RV32-NO-ZFBFMIN-NEXT: fmv.x.w a1, fa0 +; RV32-NO-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NO-ZFBFMIN-NEXT: vmv.v.x v8, a1 +; RV32-NO-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NO-ZFBFMIN-NEXT: ret +; +; RV64-NO-ZFBFMIN-LABEL: buildvec_v2bf16: +; RV64-NO-ZFBFMIN: # %bb.0: +; RV64-NO-ZFBFMIN-NEXT: fmv.x.w a0, fa1 +; RV64-NO-ZFBFMIN-NEXT: fmv.x.w a1, fa0 +; RV64-NO-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NO-ZFBFMIN-NEXT: vmv.v.x v8, a1 +; RV64-NO-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NO-ZFBFMIN-NEXT: ret +; +; RV32-ZFBFMIN-LABEL: buildvec_v2bf16: +; RV32-ZFBFMIN: # %bb.0: +; RV32-ZFBFMIN-NEXT: fmv.x.h a0, fa1 +; RV32-ZFBFMIN-NEXT: fmv.x.h a1, fa0 +; RV32-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-ZFBFMIN-NEXT: vmv.v.x v8, a1 +; RV32-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ZFBFMIN-NEXT: ret +; +; RV64-ZFBFMIN-LABEL: buildvec_v2bf16: +; RV64-ZFBFMIN: # %bb.0: +; RV64-ZFBFMIN-NEXT: fmv.x.h a0, fa1 +; RV64-ZFBFMIN-NEXT: fmv.x.h a1, fa0 +; RV64-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-ZFBFMIN-NEXT: vmv.v.x v8, a1 +; RV64-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV64-ZFBFMIN-NEXT: ret + %v1 = insertelement <2 x bfloat> poison, bfloat %a, i64 0 + %v2 = insertelement <2 x bfloat> %v1, bfloat %b, i64 1 + ret <2 x bfloat> %v2 +} + +define <2 x bfloat> @vid_v2bf16() { +; CHECK-LABEL: vid_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 260096 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: ret + ret <2 x bfloat> +} + +define <2 x bfloat> @vid_addend1_v2bf16() { +; CHECK-LABEL: vid_addend1_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 262148 +; CHECK-NEXT: addi a0, a0, -128 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: ret + ret <2 x bfloat> +} + +define <2 x bfloat> @vid_denominator2_v2bf16() { +; CHECK-LABEL: vid_denominator2_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 260100 +; CHECK-NEXT: addi a0, a0, -256 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: ret + ret <2 x bfloat> +} + +define <2 x bfloat> @vid_step2_v2bf16() { +; CHECK-LABEL: vid_step2_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, 
e16, mf4, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vsll.vi v8, v8, 14 +; CHECK-NEXT: ret + ret <2 x bfloat> +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV32ZVFBFMIN: {{.*}} +; RV64: {{.*}} +; RV64ZVFBFMIN: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 26ed4595ca758..e3aabb5de29c2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -4,8 +4,10 @@ ; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zba,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFH ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFH,RV64V ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+rva22u64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFH,RVA22U64 -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFHMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN,RV32-NO-ZFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFHMIN,RV64-NO-ZFHMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfhmin,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN,RV32-ZFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfhmin,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFHMIN,RV64-ZFHMIN ; Tests that a floating-point build_vector doesn't try and generate a VID ; instruction @@ -169,12 +171,95 @@ define <4 x half> @splat_c3_v4f16(<4 x half> %v) { } define <4 x half> @splat_idx_v4f16(<4 x half> %v, i64 %idx) { -; CHECK-LABEL: splat_idx_v4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vrgather.vx v9, v8, a0 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: splat_idx_v4f16: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32ZVFH-NEXT: vrgather.vx v9, v8, a0 +; RV32ZVFH-NEXT: vmv1r.v v8, v9 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: splat_idx_v4f16: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVFH-NEXT: vrgather.vx v9, v8, a0 +; RV64ZVFH-NEXT: vmv1r.v v8, v9 +; RV64ZVFH-NEXT: ret +; +; RV32-NO-ZFHMIN-LABEL: splat_idx_v4f16: +; RV32-NO-ZFHMIN: # %bb.0: +; RV32-NO-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NO-ZFHMIN-NEXT: vrgather.vx v9, v8, a0 +; RV32-NO-ZFHMIN-NEXT: vmv1r.v v8, v9 +; RV32-NO-ZFHMIN-NEXT: ret +; +; RV64-NO-ZFHMIN-LABEL: splat_idx_v4f16: +; RV64-NO-ZFHMIN: # %bb.0: +; RV64-NO-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NO-ZFHMIN-NEXT: vrgather.vx v9, v8, a0 +; RV64-NO-ZFHMIN-NEXT: vmv1r.v v8, v9 +; RV64-NO-ZFHMIN-NEXT: ret +; +; RV32-ZFHMIN-LABEL: splat_idx_v4f16: +; RV32-ZFHMIN: # %bb.0: +; RV32-ZFHMIN-NEXT: addi sp, sp, -48 +; RV32-ZFHMIN-NEXT: 
.cfi_def_cfa_offset 48 +; RV32-ZFHMIN-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-ZFHMIN-NEXT: .cfi_offset ra, -4 +; RV32-ZFHMIN-NEXT: csrr a1, vlenb +; RV32-ZFHMIN-NEXT: slli a1, a1, 1 +; RV32-ZFHMIN-NEXT: sub sp, sp, a1 +; RV32-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV32-ZFHMIN-NEXT: addi a1, sp, 32 +; RV32-ZFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; RV32-ZFHMIN-NEXT: andi a0, a0, 3 +; RV32-ZFHMIN-NEXT: li a1, 2 +; RV32-ZFHMIN-NEXT: call __mulsi3 +; RV32-ZFHMIN-NEXT: addi a1, sp, 16 +; RV32-ZFHMIN-NEXT: add a0, a1, a0 +; RV32-ZFHMIN-NEXT: addi a2, sp, 32 +; RV32-ZFHMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload +; RV32-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-ZFHMIN-NEXT: vse16.v v8, (a1) +; RV32-ZFHMIN-NEXT: flh fa5, 0(a0) +; RV32-ZFHMIN-NEXT: fmv.x.h a0, fa5 +; RV32-ZFHMIN-NEXT: vmv.v.x v8, a0 +; RV32-ZFHMIN-NEXT: csrr a0, vlenb +; RV32-ZFHMIN-NEXT: slli a0, a0, 1 +; RV32-ZFHMIN-NEXT: add sp, sp, a0 +; RV32-ZFHMIN-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-ZFHMIN-NEXT: addi sp, sp, 48 +; RV32-ZFHMIN-NEXT: ret +; +; RV64-ZFHMIN-LABEL: splat_idx_v4f16: +; RV64-ZFHMIN: # %bb.0: +; RV64-ZFHMIN-NEXT: addi sp, sp, -48 +; RV64-ZFHMIN-NEXT: .cfi_def_cfa_offset 48 +; RV64-ZFHMIN-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; RV64-ZFHMIN-NEXT: .cfi_offset ra, -8 +; RV64-ZFHMIN-NEXT: csrr a1, vlenb +; RV64-ZFHMIN-NEXT: slli a1, a1, 1 +; RV64-ZFHMIN-NEXT: sub sp, sp, a1 +; RV64-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV64-ZFHMIN-NEXT: addi a1, sp, 32 +; RV64-ZFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; RV64-ZFHMIN-NEXT: andi a0, a0, 3 +; RV64-ZFHMIN-NEXT: li a1, 2 +; RV64-ZFHMIN-NEXT: call __muldi3 +; RV64-ZFHMIN-NEXT: addi a1, sp, 16 +; RV64-ZFHMIN-NEXT: add a0, a1, a0 +; RV64-ZFHMIN-NEXT: addi a2, sp, 32 +; RV64-ZFHMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload +; RV64-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-ZFHMIN-NEXT: vse16.v v8, (a1) +; RV64-ZFHMIN-NEXT: flh fa5, 0(a0) +; RV64-ZFHMIN-NEXT: fmv.x.h a0, fa5 +; RV64-ZFHMIN-NEXT: vmv.v.x v8, a0 +; RV64-ZFHMIN-NEXT: csrr a0, vlenb +; RV64-ZFHMIN-NEXT: slli a0, a0, 1 +; RV64-ZFHMIN-NEXT: add sp, sp, a0 +; RV64-ZFHMIN-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; RV64-ZFHMIN-NEXT: addi sp, sp, 48 +; RV64-ZFHMIN-NEXT: ret %x = extractelement <4 x half> %v, i64 %idx %ins = insertelement <4 x half> poison, half %x, i32 0 %splat = shufflevector <4 x half> %ins, <4 x half> poison, <4 x i32> zeroinitializer @@ -295,23 +380,41 @@ define <2 x half> @buildvec_v2f16(half %a, half %b) { ; RV64ZVFH-NEXT: vfslide1down.vf v8, v8, fa1 ; RV64ZVFH-NEXT: ret ; -; RV32ZVFHMIN-LABEL: buildvec_v2f16: -; RV32ZVFHMIN: # %bb.0: -; RV32ZVFHMIN-NEXT: fmv.x.w a0, fa1 -; RV32ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; RV32ZVFHMIN-NEXT: vmv.v.x v8, a1 -; RV32ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0 -; RV32ZVFHMIN-NEXT: ret +; RV32-NO-ZFHMIN-LABEL: buildvec_v2f16: +; RV32-NO-ZFHMIN: # %bb.0: +; RV32-NO-ZFHMIN-NEXT: fmv.x.w a0, fa1 +; RV32-NO-ZFHMIN-NEXT: fmv.x.w a1, fa0 +; RV32-NO-ZFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NO-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV32-NO-ZFHMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NO-ZFHMIN-NEXT: ret ; -; RV64ZVFHMIN-LABEL: buildvec_v2f16: -; RV64ZVFHMIN: # %bb.0: -; RV64ZVFHMIN-NEXT: fmv.x.w a0, fa1 -; 
RV64ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; RV64ZVFHMIN-NEXT: vmv.v.x v8, a1 -; RV64ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVFHMIN-NEXT: ret +; RV64-NO-ZFHMIN-LABEL: buildvec_v2f16: +; RV64-NO-ZFHMIN: # %bb.0: +; RV64-NO-ZFHMIN-NEXT: fmv.x.w a0, fa1 +; RV64-NO-ZFHMIN-NEXT: fmv.x.w a1, fa0 +; RV64-NO-ZFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NO-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64-NO-ZFHMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NO-ZFHMIN-NEXT: ret +; +; RV32-ZFHMIN-LABEL: buildvec_v2f16: +; RV32-ZFHMIN: # %bb.0: +; RV32-ZFHMIN-NEXT: fmv.x.h a0, fa1 +; RV32-ZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV32-ZFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV32-ZFHMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ZFHMIN-NEXT: ret +; +; RV64-ZFHMIN-LABEL: buildvec_v2f16: +; RV64-ZFHMIN: # %bb.0: +; RV64-ZFHMIN-NEXT: fmv.x.h a0, fa1 +; RV64-ZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV64-ZFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64-ZFHMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV64-ZFHMIN-NEXT: ret %v1 = insertelement <2 x half> poison, half %a, i64 0 %v2 = insertelement <2 x half> %v1, half %b, i64 1 ret <2 x half> %v2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll index a566fab1596f6..31e2d75e514b4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll @@ -648,12 +648,8 @@ define void @fcmp_oeq_vf_v8f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -681,12 +677,8 @@ define void @fcmp_oeq_vf_v8f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -814,12 +806,8 @@ define void @fcmp_olt_vf_v16f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -847,12 +835,8 @@ define void @fcmp_olt_vf_v16f16_nonans(ptr %x, half %y, ptr %z) { ; 
ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -965,12 +949,8 @@ define void @fcmp_ule_vf_v32f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: li a2, 32 ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v12, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -1001,12 +981,8 @@ define void @fcmp_ule_vf_v32f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: li a2, 32 ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v12, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -1217,19 +1193,15 @@ define void @fcmp_ord_vf_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v9, v9 +; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10 ; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 @@ -1275,19 +1247,15 @@ define void @fcmp_uno_vf_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; 
ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v9, v9 +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v9, v9 +; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10 ; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 @@ -1321,12 +1289,8 @@ define void @fcmp_oeq_fv_v8f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -1354,12 +1318,8 @@ define void @fcmp_oeq_fv_v8f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -1487,12 +1447,8 @@ define void @fcmp_olt_fv_v16f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -1520,12 +1476,8 @@ define void @fcmp_olt_fv_v16f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -1638,12 +1590,8 @@ define void @fcmp_ule_fv_v32f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: li a2, 32 ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: 
fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v12, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -1674,12 +1622,8 @@ define void @fcmp_ule_fv_v32f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: li a2, 32 ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v12, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -1890,19 +1834,15 @@ define void @fcmp_ord_fv_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v9, v9 +; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10 ; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 @@ -1948,19 +1888,15 @@ define void @fcmp_uno_fv_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v9, v9 +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v9, v9 +; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10 ; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll index b1250f4804549..c94cdadc8ca59 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll @@ -7,11 +7,9 @@ define <8 x bfloat> @splat_v8bf16(ptr %x, bfloat %y) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_v8bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: -; ZFBFMIN-ZVFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, 
zero, e32, m2, ta, ma -; ZFBFMIN-ZVFBFMIN-NEXT: vfmv.v.f v10, fa5 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZFBFMIN-ZVFBFMIN-NEXT: vfncvtbf16.f.f.w v8, v10 +; ZFBFMIN-ZVFBFMIN-NEXT: fmv.x.h a0, fa0 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; ; ZVFBFMIN-LABEL: splat_v8bf16: @@ -28,11 +26,9 @@ define <8 x bfloat> @splat_v8bf16(ptr %x, bfloat %y) { define <16 x bfloat> @splat_16bf16(ptr %x, bfloat %y) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_16bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: -; ZFBFMIN-ZVFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZFBFMIN-ZVFBFMIN-NEXT: vfmv.v.f v12, fa5 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZFBFMIN-ZVFBFMIN-NEXT: vfncvtbf16.f.f.w v8, v12 +; ZFBFMIN-ZVFBFMIN-NEXT: fmv.x.h a0, fa0 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; ; ZVFBFMIN-LABEL: splat_16bf16: @@ -46,10 +42,31 @@ define <16 x bfloat> @splat_16bf16(ptr %x, bfloat %y) { ret <16 x bfloat> %b } +define <64 x bfloat> @splat_64bf16(ptr %x, bfloat %y) { +; ZFBFMIN-ZVFBFMIN-LABEL: splat_64bf16: +; ZFBFMIN-ZVFBFMIN: # %bb.0: +; ZFBFMIN-ZVFBFMIN-NEXT: fmv.x.h a0, fa0 +; ZFBFMIN-ZVFBFMIN-NEXT: li a1, 64 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 +; ZFBFMIN-ZVFBFMIN-NEXT: ret +; +; ZVFBFMIN-LABEL: splat_64bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: fmv.x.w a0, fa0 +; ZVFBFMIN-NEXT: li a1, 64 +; ZVFBFMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v8, a0 +; ZVFBFMIN-NEXT: ret + %a = insertelement <64 x bfloat> poison, bfloat %y, i32 0 + %b = shufflevector <64 x bfloat> %a, <64 x bfloat> poison, <64 x i32> zeroinitializer + ret <64 x bfloat> %b +} + define <8 x bfloat> @splat_zero_v8bf16(ptr %x) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_zero_v8bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.i v8, 0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; @@ -64,7 +81,7 @@ define <8 x bfloat> @splat_zero_v8bf16(ptr %x) { define <16 x bfloat> @splat_zero_16bf16(ptr %x) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_zero_16bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.i v8, 0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; @@ -80,7 +97,7 @@ define <8 x bfloat> @splat_negzero_v8bf16(ptr %x) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_negzero_v8bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: ; ZFBFMIN-ZVFBFMIN-NEXT: lui a0, 1048568 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; @@ -97,7 +114,7 @@ define <16 x bfloat> @splat_negzero_16bf16(ptr %x) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_negzero_16bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: ; ZFBFMIN-ZVFBFMIN-NEXT: lui a0, 1048568 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll index 10103813d526c..250b3e90cbbb6 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV32 ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFH -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFHMIN,RV64-ZVFHMIN-NOZFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfhmin,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFHMIN,RV64_ZVFHMIN-ZFHMIN define void @splat_v8f16(ptr %x, half %y) { ; CHECK-RV32-LABEL: splat_v8f16: @@ -18,13 +19,21 @@ define void @splat_v8f16(ptr %x, half %y) { ; RV64-ZVFH-NEXT: vse16.v v8, (a0) ; RV64-ZVFH-NEXT: ret ; -; RV64-ZVFHMIN-LABEL: splat_v8f16: -; RV64-ZVFHMIN: # %bb.0: -; RV64-ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; RV64-ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-ZVFHMIN-NEXT: vmv.v.x v8, a1 -; RV64-ZVFHMIN-NEXT: vse16.v v8, (a0) -; RV64-ZVFHMIN-NEXT: ret +; RV64-ZVFHMIN-NOZFHMIN-LABEL: splat_v8f16: +; RV64-ZVFHMIN-NOZFHMIN: # %bb.0: +; RV64-ZVFHMIN-NOZFHMIN-NEXT: fmv.x.w a1, fa0 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vse16.v v8, (a0) +; RV64-ZVFHMIN-NOZFHMIN-NEXT: ret +; +; RV64_ZVFHMIN-ZFHMIN-LABEL: splat_v8f16: +; RV64_ZVFHMIN-ZFHMIN: # %bb.0: +; RV64_ZVFHMIN-ZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64_ZVFHMIN-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vse16.v v8, (a0) +; RV64_ZVFHMIN-ZFHMIN-NEXT: ret %a = insertelement <8 x half> poison, half %y, i32 0 %b = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> zeroinitializer store <8 x half> %b, ptr %x @@ -72,13 +81,21 @@ define void @splat_16f16(ptr %x, half %y) { ; RV64-ZVFH-NEXT: vse16.v v8, (a0) ; RV64-ZVFH-NEXT: ret ; -; RV64-ZVFHMIN-LABEL: splat_16f16: -; RV64-ZVFHMIN: # %bb.0: -; RV64-ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; RV64-ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-ZVFHMIN-NEXT: vmv.v.x v8, a1 -; RV64-ZVFHMIN-NEXT: vse16.v v8, (a0) -; RV64-ZVFHMIN-NEXT: ret +; RV64-ZVFHMIN-NOZFHMIN-LABEL: splat_16f16: +; RV64-ZVFHMIN-NOZFHMIN: # %bb.0: +; RV64-ZVFHMIN-NOZFHMIN-NEXT: fmv.x.w a1, fa0 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vse16.v v8, (a0) +; RV64-ZVFHMIN-NOZFHMIN-NEXT: ret +; +; RV64_ZVFHMIN-ZFHMIN-LABEL: splat_16f16: +; RV64_ZVFHMIN-ZFHMIN: # %bb.0: +; RV64_ZVFHMIN-ZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64_ZVFHMIN-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vse16.v v8, (a0) +; RV64_ZVFHMIN-ZFHMIN-NEXT: ret %a = insertelement <16 x half> poison, half %y, i32 0 %b = shufflevector <16 x half> %a, <16 x half> poison, <16 x i32> zeroinitializer store <16 x half> %b, ptr %x @@ -111,6 +128,46 @@ define void @splat_v4f64(ptr %x, double %y) { ret 
void } +define void @splat_64f16(ptr %x, half %y) { +; CHECK-RV32-LABEL: splat_64f16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a1, 64 +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV32-NEXT: vfmv.v.f v8, fa0 +; CHECK-RV32-NEXT: vse16.v v8, (a0) +; CHECK-RV32-NEXT: ret +; +; RV64-ZVFH-LABEL: splat_64f16: +; RV64-ZVFH: # %bb.0: +; RV64-ZVFH-NEXT: li a1, 64 +; RV64-ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV64-ZVFH-NEXT: vfmv.v.f v8, fa0 +; RV64-ZVFH-NEXT: vse16.v v8, (a0) +; RV64-ZVFH-NEXT: ret +; +; RV64-ZVFHMIN-NOZFHMIN-LABEL: splat_64f16: +; RV64-ZVFHMIN-NOZFHMIN: # %bb.0: +; RV64-ZVFHMIN-NOZFHMIN-NEXT: fmv.x.w a1, fa0 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: li a2, 64 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vse16.v v8, (a0) +; RV64-ZVFHMIN-NOZFHMIN-NEXT: ret +; +; RV64_ZVFHMIN-ZFHMIN-LABEL: splat_64f16: +; RV64_ZVFHMIN-ZFHMIN: # %bb.0: +; RV64_ZVFHMIN-ZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV64_ZVFHMIN-ZFHMIN-NEXT: li a2, 64 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64_ZVFHMIN-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vse16.v v8, (a0) +; RV64_ZVFHMIN-ZFHMIN-NEXT: ret + %a = insertelement <64 x half> poison, half %y, i32 0 + %b = shufflevector <64 x half> %a, <64 x half> poison, <64 x i32> zeroinitializer + store <64 x half> %b, ptr %x + ret void +} + define void @splat_zero_v8f16(ptr %x) { ; CHECK-LABEL: splat_zero_v8f16: ; CHECK: # %bb.0: @@ -268,3 +325,5 @@ define void @splat_negzero_v4f64(ptr %x) { store <4 x double> splat (double -0.0), ptr %x ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV64-ZVFHMIN: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index cdbca0b874e60..1843157573257 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -868,16 +868,12 @@ define void @copysign_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8 +; ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -903,16 +899,16 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: 
fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v9, v8 +; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -927,16 +923,16 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v9, v8 +; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -1225,10 +1221,6 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; ZVFHMIN-RV64-NEXT: vle64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: mv a2, sp -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a2) -; ZVFHMIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a2) ; ZVFHMIN-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle32.v v9, (a1) ; ZVFHMIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma @@ -2193,16 +2185,12 @@ define void @fadd_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v8, v9, v8 +; ZVFHMIN-NEXT: vfadd.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2228,16 +2216,16 @@ define void @fadd_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; 
ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v9, v8 +; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2252,16 +2240,16 @@ define void @fadd_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v9, v8 +; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2331,16 +2319,12 @@ define void @fadd_fv_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v8, v8, v9 +; ZVFHMIN-NEXT: vfadd.vv v8, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2366,16 +2350,16 @@ define void @fadd_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; 
ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v8, v9 +; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v10, v9 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2390,16 +2374,16 @@ define void @fadd_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v8, v9 +; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v10, v9 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2469,16 +2453,12 @@ define void @fsub_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v8, v9, v8 +; ZVFHMIN-NEXT: vfsub.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2504,16 +2484,16 @@ define void @fsub_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v9, v8 +; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2528,16 +2508,16 @@ define void 
@fsub_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v9, v8 +; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2607,16 +2587,12 @@ define void @fsub_fv_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v8, v8, v9 +; ZVFHMIN-NEXT: vfsub.vv v8, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2642,16 +2618,16 @@ define void @fsub_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v8, v9 +; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v10, v9 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2666,16 +2642,16 @@ define void @fsub_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: 
vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v8, v9 +; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v10, v9 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2745,16 +2721,12 @@ define void @fmul_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v8, v9, v8 +; ZVFHMIN-NEXT: vfmul.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2780,16 +2752,16 @@ define void @fmul_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v9, v8 +; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2804,16 +2776,16 @@ define void @fmul_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: 
vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v9, v8 +; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2883,16 +2855,12 @@ define void @fmul_fv_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v8, v8, v9 +; ZVFHMIN-NEXT: vfmul.vv v8, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2918,16 +2886,16 @@ define void @fmul_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v8, v9 +; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v10, v9 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2942,16 +2910,16 @@ define void @fmul_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v8, v9 +; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v10, v9 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -3021,16 
+2989,12 @@ define void @fdiv_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v8, v9, v8 +; ZVFHMIN-NEXT: vfdiv.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -3056,16 +3020,16 @@ define void @fdiv_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v9, v8 +; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -3080,16 +3044,16 @@ define void @fdiv_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v9, v8 +; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -3159,16 +3123,12 @@ define void @fdiv_fv_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; 
ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v8, v8, v9 +; ZVFHMIN-NEXT: vfdiv.vv v8, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -3194,16 +3154,16 @@ define void @fdiv_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v8, v9 +; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v10, v9 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -3218,16 +3178,16 @@ define void @fdiv_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v8, v9 +; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v10, v9 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -3297,22 +3257,18 @@ define void @fma_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-LABEL: fma_vf_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vle16.v v9, (a1) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -3338,25 +3294,25 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 ; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fma_vf_v6f16: @@ -3364,22 +3320,22 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) +; 
ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) ; ZVFHMIN-RV64-NEXT: ret @@ -3450,22 +3406,18 @@ define void @fma_fv_v8f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-LABEL: fma_fv_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vle16.v v9, (a1) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -3491,25 +3443,25 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 ; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fma_fv_v6f16: @@ -3517,22 +3469,22 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV64-NEXT: vsetivli 
zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) +; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) ; ZVFHMIN-RV64-NEXT: ret @@ -3603,26 +3555,22 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-LABEL: fmsub_vf_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vle16.v v9, (a1) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v10 +; ZVFHMIN-NEXT: vfneg.v v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 +; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmacc.vv v11, v9, v8 +; ZVFHMIN-NEXT: vfmacc.vv v10, v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x @@ -3650,30 +3598,30 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v9, v10 +; ZVFHMIN-RV32-NEXT: vfneg.v v9, v11 ; 
ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v9 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmacc.vv v11, v9, v8 +; ZVFHMIN-RV32-NEXT: vfmacc.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) +; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fmsub_vf_v6f16: @@ -3681,27 +3629,27 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v9, v10 +; ZVFHMIN-RV64-NEXT: vfneg.v v9, v11 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v9 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmacc.vv v11, v9, v8 +; ZVFHMIN-RV64-NEXT: vfmacc.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 +; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) ; ZVFHMIN-RV64-NEXT: ret @@ -3829,13 +3777,13 @@ define void @trunc_v8f16(ptr %x) { ; ; ZVFHMIN-ZFH-RV32-LABEL: trunc_v8f16: ; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: 
flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV32-NEXT: lui a1, %hi(.LCPI115_0) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, %lo(.LCPI115_0)(a1) ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 @@ -3846,92 +3794,101 @@ define void @trunc_v8f16(ptr %x) { ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_2: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_4 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa3, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_4: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_6 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa3, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_6: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa0, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa0, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_8 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a1, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa1, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_8: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_10 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_10 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa1, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa4, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_10: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 22(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_12 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: 
fabs.h fa2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_12 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa2, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_12: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_14 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_14 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a1, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa4, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_14: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_16 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB115_16 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.15: ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_16: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 16(sp) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; ; ZVFHMIN-ZFH-RV64-LABEL: trunc_v8f16: ; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: lui a1, %hi(.LCPI115_0) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, %lo(.LCPI115_0)(a1) ; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 @@ -3942,92 +3899,101 @@ define void @trunc_v8f16(ptr %x) { ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz ; 
ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_2: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_4 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa1, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa3, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_4: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_6 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa3, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_6: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa0, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa0, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_8 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa3, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa1, a1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa3, fa1, fa3 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_8: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_10 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_10 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a2, fa1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a2, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa4, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_10: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 22(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_12 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa2, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_12 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: 
fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a3, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa2, a3, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa2, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_12: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_14 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_14 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa4, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_14: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_16 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a2, fa3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a2, .LBB115_16 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.15: ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa5, a1, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_16: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 16(sp) -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV32-LABEL: trunc_v8f16: ; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 307200 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.w.x fa5, a1 @@ -4039,107 +4005,116 @@ define void @trunc_v8f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_2: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s 
fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_4 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.3: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa2, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa3, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_4: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa1, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_6 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa3, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a1, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa3, fa2, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_6: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa0, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa0 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_8 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.7: ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a1, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa1, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_8: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_10 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB115_10 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_10: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 22(sp) -; 
ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_12 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB115_12 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a3, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa2, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_12: +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 12(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_14 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB115_14 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a2, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_14: +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_16 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB115_16 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.15: ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a1, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa5, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_16: ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 16(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; 
ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV64-LABEL: trunc_v8f16: ; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 307200 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.w.x fa5, a1 @@ -4151,96 +4126,105 @@ define void @trunc_v8f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_2: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_4 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa2, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa3, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_4: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa3, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa1, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_6 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa3, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa3, fa2, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_6: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa0, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa0 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa0, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_8 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.7: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa1, fa4 ; 
ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_8: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_10 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB115_10 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa2, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a2, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa1, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_10: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 22(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_12 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB115_12 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a3, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a3, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa2, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_12: +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 12(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_14 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB115_14 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a2, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_14: +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 0(sp) -; 
ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_16 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB115_16 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.15: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa5, a1, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa5, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_16: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <8 x half>, ptr %x %b = call <8 x half> @llvm.trunc.v8f16(<8 x half> %a) @@ -4268,13 +4252,13 @@ define void @trunc_v6f16(ptr %x) { ; ; ZVFHMIN-ZFH-RV32-LABEL: trunc_v6f16: ; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -48 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV32-NEXT: lui a1, %hi(.LCPI116_0) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, %lo(.LCPI116_0)(a1) ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 @@ -4285,103 +4269,111 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_2: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 46(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_4 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa3, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_4: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 44(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_6 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa3, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_6: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 42(sp) -; 
ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_8 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa0, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa0, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB116_8 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa1, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_8: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 40(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_10 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_10 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa1, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa4, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_10: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 38(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa2, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_12 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a4, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a4, .LBB116_12 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa3, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a4, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa2, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_12: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa3, 36(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa1, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_14 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a6, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a6, .LBB116_14 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa1, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a5, fa2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a5, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa4, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_14: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa2, 34(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: 
fabs.h fa0, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa0, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_16 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a6, fa3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a6, .LBB116_16 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa5, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a5, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a5, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_16: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa1, 32(sp) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 32 -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa3, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa2, 26(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa1, 24(sp) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 48 +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; ; ZVFHMIN-ZFH-RV64-LABEL: trunc_v6f16: ; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: lui a1, %hi(.LCPI116_0) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, %lo(.LCPI116_0)(a1) ; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 @@ -4392,96 +4384,105 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_2: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_4 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa1, rtz ; ZVFHMIN-ZFH-RV64-NEXT: 
fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa3, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_4: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_6 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa3, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_6: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa0, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa0, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_8 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa3, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa1, a1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa3, fa1, fa3 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_8: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_10 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_10 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a2, fa1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a2, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa4, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_10: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 22(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_12 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa2, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_12 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a3, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa2, a3, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa2, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_12: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_14 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 12(sp) +; 
ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_14 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa4, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_14: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_16 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a2, fa3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a2, .LBB116_16 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.15: ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa5, a1, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_16: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 16(sp) -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV32-LABEL: trunc_v6f16: ; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -48 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 307200 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.w.x fa5, a1 @@ -4493,118 +4494,126 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_2: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 46(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_4 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.3: -; 
ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa2, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa3, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_4: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 44(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa1, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_6 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa3, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a1, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa3, fa2, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_6: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 42(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_8 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa0, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa0 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_8 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa1, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_8: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 40(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_10 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_10 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a3, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_10: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 38(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa3, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa2, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa2, fa5 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_12 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a4, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a4, .LBB116_12 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa3, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa3, fa2, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a4, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa2, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_12: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa3, 36(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa2, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_14 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a6, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a6, .LBB116_14 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a5, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a5, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_14: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa2, 34(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa1, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa0, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_16 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a6, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a6, .LBB116_16 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa1, fa5, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a5, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a5, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa5, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_16: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 32(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa3, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa2, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh 
fa5, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 48 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV64-LABEL: trunc_v6f16: ; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 307200 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.w.x fa5, a1 @@ -4616,100 +4625,109 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_2: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_4 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa2, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa3, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_4: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa3, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa1, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_6 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; 
ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa3, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa3, fa2, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_6: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa0, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa0 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa0, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_8 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.7: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa1, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_8: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_10 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB116_10 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa2, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a2, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa1, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_10: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 22(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_12 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB116_12 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a3, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a3, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa2, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_12: +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 12(sp) ; 
ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_14 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB116_14 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a2, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_14: +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_16 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB116_16 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.15: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa5, a1, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa5, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_16: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.trunc.v6f16(<6 x half> %a) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index 33e9cde4c30ab..8e2a225622eec 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -64,12 +64,9 @@ define <8 x i1> @fcmp_oeq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_oeq_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: 
vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -91,12 +88,9 @@ define <8 x i1> @fcmp_oeq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_oeq_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -138,12 +132,9 @@ define <8 x i1> @fcmp_ogt_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ogt_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -165,12 +156,9 @@ define <8 x i1> @fcmp_ogt_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ogt_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -212,12 +200,9 @@ define <8 x i1> @fcmp_oge_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_oge_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -239,12 +224,9 @@ define <8 x i1> @fcmp_oge_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_oge_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -286,12 +268,9 @@ define <8 x i1> @fcmp_olt_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_olt_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; 
ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -313,12 +292,9 @@ define <8 x i1> @fcmp_olt_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_olt_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -360,12 +336,9 @@ define <8 x i1> @fcmp_ole_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ole_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -387,12 +360,9 @@ define <8 x i1> @fcmp_ole_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ole_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -439,12 +409,9 @@ define <8 x i1> @fcmp_one_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_one_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -469,12 +436,9 @@ define <8 x i1> @fcmp_one_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_one_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -526,12 +490,9 @@ define <8 x i1> @fcmp_ord_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ord_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; 
ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t @@ -560,12 +521,9 @@ define <8 x i1> @fcmp_ord_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ord_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t @@ -615,12 +573,9 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ueq_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -645,12 +600,9 @@ define <8 x i1> @fcmp_ueq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ueq_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -695,12 +647,9 @@ define <8 x i1> @fcmp_ugt_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ugt_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -723,12 +672,9 @@ define <8 x i1> @fcmp_ugt_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ugt_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -772,12 +718,9 @@ define <8 x i1> 
@fcmp_uge_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_uge_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -800,12 +743,9 @@ define <8 x i1> @fcmp_uge_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_uge_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -849,12 +789,9 @@ define <8 x i1> @fcmp_ult_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ult_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -877,12 +814,9 @@ define <8 x i1> @fcmp_ult_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ult_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -926,12 +860,9 @@ define <8 x i1> @fcmp_ule_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ule_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -954,12 +885,9 @@ define <8 x i1> @fcmp_ule_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ule_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, 
v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -1001,12 +929,9 @@ define <8 x i1> @fcmp_une_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_une_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -1028,12 +953,9 @@ define <8 x i1> @fcmp_une_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_une_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -1084,12 +1006,9 @@ define <8 x i1> @fcmp_uno_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_uno_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t @@ -1118,12 +1037,9 @@ define <8 x i1> @fcmp_uno_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_uno_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll index f023c760f14a7..7a7236235d120 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll @@ -61,16 +61,13 @@ define <2 x half> @vfadd_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfadd_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: 
vfadd.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -89,16 +86,13 @@ define <2 x half> @vfadd_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfadd_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -184,16 +178,13 @@ define <4 x half> @vfadd_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfadd_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -212,16 +203,13 @@ define <4 x half> @vfadd_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfadd_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -284,12 +272,9 @@ define <8 x half> @vfadd_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfadd_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -312,12 +297,9 @@ define <8 x half> @vfadd_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: 
vfadd_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -384,12 +366,9 @@ define <16 x half> @vfadd_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ; ZVFHMIN-LABEL: vfadd_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma @@ -412,12 +391,9 @@ define <16 x half> @vfadd_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ; ZVFHMIN-LABEL: vfadd_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll index 9fb8377d5a5ef..cb83e5ff4f2b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll @@ -61,16 +61,13 @@ define <2 x half> @vfdiv_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfdiv_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -89,16 +86,13 @@ define <2 x half> @vfdiv_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfdiv_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: 
vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -184,16 +178,13 @@ define <4 x half> @vfdiv_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfdiv_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -212,16 +203,13 @@ define <4 x half> @vfdiv_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfdiv_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -284,12 +272,9 @@ define <8 x half> @vfdiv_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfdiv_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -312,12 +297,9 @@ define <8 x half> @vfdiv_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfdiv_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -384,12 +366,9 @@ define <16 x half> @vfdiv_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ; ZVFHMIN-LABEL: vfdiv_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: 
vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma @@ -412,12 +391,9 @@ define <16 x half> @vfdiv_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ; ZVFHMIN-LABEL: vfdiv_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll index e2e48cee3eacc..6dcebc9763d82 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll @@ -64,17 +64,14 @@ define <2 x half> @vfma_vf_v2f16(<2 x half> %va, half %b, <2 x half> %vc, <2 x i ; ; ZVFHMIN-LABEL: vfma_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v10, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -93,17 +90,14 @@ define <2 x half> @vfma_vf_v2f16_unmasked(<2 x half> %va, half %b, <2 x half> %v ; ; ZVFHMIN-LABEL: vfma_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v10 +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -169,17 +163,14 @@ define <4 x half> @vfma_vf_v4f16(<4 x half> %va, half %b, <4 x half> %vc, <4 x i ; ; ZVFHMIN-LABEL: vfma_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v10, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -198,17 +189,14 @@ define <4 x half> @vfma_vf_v4f16_unmasked(<4 x half> %va, half %b, <4 x half> %v ; ; ZVFHMIN-LABEL: vfma_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v10 +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -274,17 +262,14 @@ define <8 x half> @vfma_vf_v8f16(<8 x half> %va, half %b, <8 x half> %vc, <8 x i ; ; ZVFHMIN-LABEL: vfma_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v10, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret @@ -303,17 +288,14 @@ define <8 x half> @vfma_vf_v8f16_unmasked(<8 x half> %va, half %b, <8 x half> %v ; ; ZVFHMIN-LABEL: vfma_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v10 +; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret @@ -379,17 +361,14 @@ define <16 x half> @vfma_vf_v16f16(<16 x half> %va, half %b, <16 x half> %vc, <1 ; ; ZVFHMIN-LABEL: vfma_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v12 +; 
ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v12, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret @@ -408,17 +387,14 @@ define <16 x half> @vfma_vf_v16f16_unmasked(<16 x half> %va, half %b, <16 x half ; ; ZVFHMIN-LABEL: vfma_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v12 +; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll index a9d6b5f047ebb..11420a23285d0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll @@ -40,16 +40,13 @@ define <2 x half> @vfmax_v2f16_vf(<2 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v2f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmax.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -68,16 +65,13 @@ define <2 x half> @vfmax_v2f16_fv(<2 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v2f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vfmax.vv v9, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -119,16 +113,13 @@ define <4 x half> @vfmax_v4f16_vf(<4 x half> %a, half %b) { ; ; 
ZVFHMIN-LABEL: vfmax_v4f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmax.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -147,16 +138,13 @@ define <4 x half> @vfmax_v4f16_fv(<4 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v4f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vfmax.vv v9, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -198,12 +186,9 @@ define <8 x half> @vfmax_v8f16_vf(<8 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v8f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -226,12 +211,9 @@ define <8 x half> @vfmax_v8f16_fv(<8 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v8f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -277,12 +259,9 @@ define <16 x half> @vfmax_v16f16_vf(<16 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v16f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -305,12 +284,9 @@ define <16 x half> @vfmax_v16f16_fv(<16 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v16f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: 
fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll index d7c6fb3568f66..e8ae32a45f7cd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll @@ -40,16 +40,13 @@ define <2 x half> @vfmin_v2f16_vf(<2 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v2f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmin.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -68,16 +65,13 @@ define <2 x half> @vfmin_v2f16_fv(<2 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v2f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -119,16 +113,13 @@ define <4 x half> @vfmin_v4f16_vf(<4 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v4f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmin.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -147,16 +138,13 @@ define <4 x half> @vfmin_v4f16_fv(<4 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v4f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, 
zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -198,12 +186,9 @@ define <8 x half> @vfmin_v8f16_vf(<8 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v8f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -226,12 +211,9 @@ define <8 x half> @vfmin_v8f16_fv(<8 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v8f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -277,12 +259,9 @@ define <16 x half> @vfmin_v16f16_vf(<16 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v16f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -305,12 +284,9 @@ define <16 x half> @vfmin_v16f16_fv(<16 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v16f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll index 64ce0a12de8cf..86f140723d7f8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll @@ -61,16 +61,13 @@ define <2 x half> @vfmul_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfmul_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; 
ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -89,16 +86,13 @@ define <2 x half> @vfmul_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfmul_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -184,16 +178,13 @@ define <4 x half> @vfmul_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfmul_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -212,16 +203,13 @@ define <4 x half> @vfmul_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfmul_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -284,12 +272,9 @@ define <8 x half> @vfmul_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfmul_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; 
ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -312,12 +297,9 @@ define <8 x half> @vfmul_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfmul_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -384,12 +366,9 @@ define <16 x half> @vfmul_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ; ZVFHMIN-LABEL: vfmul_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma @@ -412,12 +391,9 @@ define <16 x half> @vfmul_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ; ZVFHMIN-LABEL: vfmul_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll index eb717a851ed46..d0a0bf516d355 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll @@ -61,16 +61,13 @@ define <2 x half> @vfsub_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfsub_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -89,16 +86,13 @@ define <2 x half> @vfsub_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfsub_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: 
vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -184,16 +178,13 @@ define <4 x half> @vfsub_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfsub_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -212,16 +203,13 @@ define <4 x half> @vfsub_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfsub_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -284,12 +272,9 @@ define <8 x half> @vfsub_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfsub_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -312,12 +297,9 @@ define <8 x half> @vfsub_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfsub_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma 
@@ -384,12 +366,9 @@ define <16 x half> @vfsub_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ; ZVFHMIN-LABEL: vfsub_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma @@ -412,12 +391,9 @@ define <16 x half> @vfsub_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ; ZVFHMIN-LABEL: vfsub_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge-bf16.ll index da7f9f56fcf16..4186a6b304a22 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge-bf16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge-bf16.ll @@ -24,11 +24,11 @@ define <2 x bfloat> @vpmerge_vv_v2bf16(<2 x bfloat> %va, <2 x bfloat> %vb, <2 x define <2 x bfloat> @vpmerge_vf_v2bf16(bfloat %a, <2 x bfloat> %vb, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpmerge_vf_v2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfmv.v.f v9, fa5 -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x bfloat> poison, bfloat %a, i32 0 %va = shufflevector <2 x bfloat> %elt.head, <2 x bfloat> poison, <2 x i32> zeroinitializer @@ -52,11 +52,11 @@ define <4 x bfloat> @vpmerge_vv_v4bf16(<4 x bfloat> %va, <4 x bfloat> %vb, <4 x define <4 x bfloat> @vpmerge_vf_v4bf16(bfloat %a, <4 x bfloat> %vb, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpmerge_vf_v4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v9, fa5 -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x bfloat> poison, bfloat %a, i32 0 %va = shufflevector <4 x bfloat> %elt.head, <4 x bfloat> poison, <4 x i32> zeroinitializer @@ -80,11 +80,11 @@ define <8 x bfloat> @vpmerge_vv_v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, <8 x define <8 x bfloat> @vpmerge_vf_v8bf16(bfloat %a, <8 x bfloat> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpmerge_vf_v8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma 
-; CHECK-NEXT: vfmv.v.f v10, fa5 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x bfloat> poison, bfloat %a, i32 0 %va = shufflevector <8 x bfloat> %elt.head, <8 x bfloat> poison, <8 x i32> zeroinitializer @@ -108,11 +108,11 @@ define <16 x bfloat> @vpmerge_vv_v16bf16(<16 x bfloat> %va, <16 x bfloat> %vb, < define <16 x bfloat> @vpmerge_vf_v16bf16(bfloat %a, <16 x bfloat> %vb, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpmerge_vf_v16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; CHECK-NEXT: vfmv.v.f v12, fa5 -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x bfloat> poison, bfloat %a, i32 0 %va = shufflevector <16 x bfloat> %elt.head, <16 x bfloat> poison, <16 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll index 9f0561b394b81..bdf76dc63ddd8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -846,11 +846,11 @@ define <2 x half> @vpmerge_vf_v2f16(half %a, <2 x half> %vb, <2 x i1> %m, i32 ze ; ; ZVFHMIN-LABEL: vpmerge_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v9, v0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %a, i32 0 %va = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -880,11 +880,11 @@ define <4 x half> @vpmerge_vf_v4f16(half %a, <4 x half> %vb, <4 x i1> %m, i32 ze ; ; ZVFHMIN-LABEL: vpmerge_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v9, v0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %a, i32 0 %va = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -914,11 +914,11 @@ define <8 x half> @vpmerge_vf_v8f16(half %a, <8 x half> %vb, <8 x i1> %m, i32 ze ; ; ZVFHMIN-LABEL: vpmerge_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 
+; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v9, v0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %a, i32 0 %va = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -948,11 +948,11 @@ define <16 x half> @vpmerge_vf_v16f16(half %a, <16 x half> %vb, <16 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vpmerge_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %a, i32 0 %va = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer From f1615e32379ff1ea125a8b3ac8792c3e0b5e6f2c Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 3 Sep 2024 17:52:08 -0700 Subject: [PATCH 008/425] [WebAssembly] Remove Kind argument from WebAssemblyOperand (NFC) (#107157) The `Kind` argument does not need to passed separately. --- .../AsmParser/WebAssemblyAsmParser.cpp | 56 ++++++++----------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index c63740d267819..24a9ad67cfe04 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -77,16 +77,16 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { struct BrLOp BrL; }; - WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, TokOp T) - : Kind(K), StartLoc(Start), EndLoc(End), Tok(T) {} - WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, IntOp I) - : Kind(K), StartLoc(Start), EndLoc(End), Int(I) {} - WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, FltOp F) - : Kind(K), StartLoc(Start), EndLoc(End), Flt(F) {} - WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, SymOp S) - : Kind(K), StartLoc(Start), EndLoc(End), Sym(S) {} - WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End) - : Kind(K), StartLoc(Start), EndLoc(End), BrL() {} + WebAssemblyOperand(SMLoc Start, SMLoc End, TokOp T) + : Kind(Token), StartLoc(Start), EndLoc(End), Tok(T) {} + WebAssemblyOperand(SMLoc Start, SMLoc End, IntOp I) + : Kind(Integer), StartLoc(Start), EndLoc(End), Int(I) {} + WebAssemblyOperand(SMLoc Start, SMLoc End, FltOp F) + : Kind(Float), StartLoc(Start), EndLoc(End), Flt(F) {} + WebAssemblyOperand(SMLoc Start, SMLoc End, SymOp S) + : Kind(Symbol), StartLoc(Start), EndLoc(End), Sym(S) {} + WebAssemblyOperand(SMLoc Start, SMLoc End) + : Kind(BrList), StartLoc(Start), EndLoc(End), BrL() {} ~WebAssemblyOperand() { if (isBrList()) @@ -388,8 +388,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { if (IsNegative) Val = -Val; Operands.push_back(std::make_unique( - WebAssemblyOperand::Integer, Int.getLoc(), Int.getEndLoc(), - WebAssemblyOperand::IntOp{Val})); + Int.getLoc(), Int.getEndLoc(), WebAssemblyOperand::IntOp{Val})); Parser.Lex(); } @@ -401,8 +400,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { if (IsNegative) Val = -Val; 
Operands.push_back(std::make_unique( - WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(), - WebAssemblyOperand::FltOp{Val})); + Flt.getLoc(), Flt.getEndLoc(), WebAssemblyOperand::FltOp{Val})); Parser.Lex(); return false; } @@ -423,8 +421,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { if (IsNegative) Val = -Val; Operands.push_back(std::make_unique( - WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(), - WebAssemblyOperand::FltOp{Val})); + Flt.getLoc(), Flt.getEndLoc(), WebAssemblyOperand::FltOp{Val})); Parser.Lex(); return false; } @@ -459,8 +456,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { // up later. auto Tok = Lexer.getTok(); Operands.push_back(std::make_unique( - WebAssemblyOperand::Integer, Tok.getLoc(), Tok.getEndLoc(), - WebAssemblyOperand::IntOp{-1})); + Tok.getLoc(), Tok.getEndLoc(), WebAssemblyOperand::IntOp{-1})); } } return false; @@ -474,8 +470,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { NestingStack.back().Sig = Sig; } Operands.push_back(std::make_unique( - WebAssemblyOperand::Integer, NameLoc, NameLoc, - WebAssemblyOperand::IntOp{static_cast(BT)})); + NameLoc, NameLoc, WebAssemblyOperand::IntOp{static_cast(BT)})); } bool parseLimits(wasm::WasmLimits *Limits) { @@ -512,16 +507,14 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { GetOrCreateFunctionTableSymbol(getContext(), Tok.getString(), is64); const auto *Val = MCSymbolRefExpr::create(Sym, getContext()); *Op = std::make_unique( - WebAssemblyOperand::Symbol, Tok.getLoc(), Tok.getEndLoc(), - WebAssemblyOperand::SymOp{Val}); + Tok.getLoc(), Tok.getEndLoc(), WebAssemblyOperand::SymOp{Val}); Parser.Lex(); return expect(AsmToken::Comma, ","); } else { const auto *Val = MCSymbolRefExpr::create(DefaultFunctionTable, getContext()); *Op = std::make_unique( - WebAssemblyOperand::Symbol, SMLoc(), SMLoc(), - WebAssemblyOperand::SymOp{Val}); + SMLoc(), SMLoc(), WebAssemblyOperand::SymOp{Val}); return false; } } else { @@ -529,8 +522,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { // write a table symbol or issue relocations. Instead we just ensure the // table is live and write a zero. getStreamer().emitSymbolAttribute(DefaultFunctionTable, MCSA_NoDeadStrip); - *Op = std::make_unique(WebAssemblyOperand::Integer, - SMLoc(), SMLoc(), + *Op = std::make_unique(SMLoc(), SMLoc(), WebAssemblyOperand::IntOp{0}); return false; } @@ -564,7 +556,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { // Now construct the name as first operand. 
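The pattern behind this cleanup: each WebAssemblyOperand constructor overload already identifies which member of the union it initializes, so it can set Kind itself rather than having every call site spell WebAssemblyOperand::Integer, WebAssemblyOperand::Float, and so on. A minimal, self-contained C++ sketch of the idea, using simplified names and types rather than the actual parser classes:

#include <cstdint>
#include <memory>
#include <vector>

// Trimmed-down stand-in for WebAssemblyOperand; the real class also
// carries SMLoc source locations and Symbol/BrList alternatives.
struct Operand {
  enum KindTy { Integer, Float } Kind;

  struct IntOp { int64_t Val; };
  struct FltOp { double Val; };

  union {
    IntOp Int;
    FltOp Flt;
  };

  // Each overload knows which union member it fills in, so it sets
  // Kind itself; callers no longer pass the tag separately.
  Operand(IntOp I) : Kind(Integer), Int(I) {}
  Operand(FltOp F) : Kind(Float), Flt(F) {}
};

int main() {
  std::vector<std::unique_ptr<Operand>> Operands;
  // Call sites shrink to just the payload, mirroring the -/+ lines
  // in the surrounding hunks.
  Operands.push_back(std::make_unique<Operand>(Operand::IntOp{42}));
  Operands.push_back(std::make_unique<Operand>(Operand::FltOp{1.5}));
  return Operands.size() == 2 ? 0 : 1;
}

The Kind tag itself is kept (the parser still queries it, for example isBrList() in the destructor shown earlier); only the responsibility for setting it moves into the constructors.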
Operands.push_back(std::make_unique( - WebAssemblyOperand::Token, NameLoc, SMLoc::getFromPointer(Name.end()), + NameLoc, SMLoc::getFromPointer(Name.end()), WebAssemblyOperand::TokOp{Name})); // If this instruction is part of a control flow structure, ensure @@ -645,8 +637,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { const MCExpr *Expr = MCSymbolRefExpr::create( WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx); Operands.push_back(std::make_unique( - WebAssemblyOperand::Symbol, Loc.getLoc(), Loc.getEndLoc(), - WebAssemblyOperand::SymOp{Expr})); + Loc.getLoc(), Loc.getEndLoc(), WebAssemblyOperand::SymOp{Expr})); } while (Lexer.isNot(AsmToken::EndOfStatement)) { @@ -671,8 +662,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { if (Parser.parseExpression(Val, End)) return error("Cannot parse symbol: ", Lexer.getTok()); Operands.push_back(std::make_unique( - WebAssemblyOperand::Symbol, Start, End, - WebAssemblyOperand::SymOp{Val})); + Start, End, WebAssemblyOperand::SymOp{Val})); if (checkForP2AlignIfLoadStore(Operands, Name)) return true; } @@ -705,8 +695,8 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { } case AsmToken::LCurly: { Parser.Lex(); - auto Op = std::make_unique( - WebAssemblyOperand::BrList, Tok.getLoc(), Tok.getEndLoc()); + auto Op = + std::make_unique(Tok.getLoc(), Tok.getEndLoc()); if (!Lexer.is(AsmToken::RCurly)) for (;;) { Op->BrL.List.push_back(Lexer.getTok().getIntVal()); From c8763f04bf2162d3f0f4f967dfeb2f0feda0c75b Mon Sep 17 00:00:00 2001 From: Yun-Fly Date: Wed, 4 Sep 2024 09:19:09 +0800 Subject: [PATCH 009/425] [mlir][tensor] Fix consumer fusion for `tensor.pack` without explicit `outer_dims_perm` attribute (#106687) --- mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp | 4 ++-- .../Interfaces/TilingInterface/tile-and-fuse-consumer.mlir | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp index f35a9cd4cb927..9e17184ebed79 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -286,7 +286,7 @@ struct PackOpTiling SmallVector outerDimOffsets, outerDimSizes; DenseMap dimAndTileMapping = packOp.getDimAndTileMapping(); - for (auto dim : packOp.getOuterDimsPerm()) { + for (auto dim : llvm::seq(packOp.getSourceRank())) { if (dimAndTileMapping.count(dim)) { FailureOr cstSize = ValueBoundsConstraintSet::computeConstantBound( @@ -327,7 +327,7 @@ struct PackOpTiling outerDimSizes.push_back(sizes[dim]); } } - + applyPermToRange(outerDimOffsets, outerDimSizes, packOp.getOuterDimsPerm()); resultOffsets = outerDimOffsets; resultSizes = outerDimSizes; return success(); diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index 741dfbfb1cd5c..83c5ec8d7342c 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -337,7 +337,7 @@ module { } } %output = tensor.empty() : tensor<4x32x16xf32> - %pack = tensor.pack %1 outer_dims_perm = [0, 1] inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> + %pack = tensor.pack %1 inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> return %pack : tensor<4x32x16xf32> } } @@ -366,7 +366,7 @@ module attributes 
{transform.with_named_sequence} { // CHECK: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV1]]) // CHECK: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] // CHECK: %[[TILED_PACK_OUT:.*]] = tensor.pack %[[GENERIC_OUT]] -// CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0] inner_tiles = [16] +// CHECK-SAME: inner_dims_pos = [0] inner_tiles = [16] // CHECK-SAME: into %[[TILED_PACK_DEST]] // CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] From 99f02a874984f2b79c3fbd8ae6bbceb7366521ad Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Wed, 4 Sep 2024 05:35:13 +0400 Subject: [PATCH 010/425] [clang] Add tests for CWG issues about language linkage (#107019) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch covers Core issues about language linkage during declaration matching resolved in [P1787R6](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p1787r6.html), namely [CWG563](https://cplusplus.github.io/CWG/issues/563.html) and [CWG1818](https://cplusplus.github.io/CWG/issues/1818.html). [CWG563](https://cplusplus.github.io/CWG/issues/563.html) "Linkage specification for objects" ----------- [P1787R6](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p1787r6.html): > [CWG563](https://cplusplus.github.io/CWG/issues/563.html) is resolved by simplifications that follow its suggestions. Wording ([[dcl.link]/5](https://eel.is/c++draft/dcl.link#5)): > In a [linkage-specification](https://eel.is/c++draft/dcl.link#nt:linkage-specification), the specified language linkage applies to the function types of all function declarators and to all functions and variables whose names have external linkage[.](https://eel.is/c++draft/dcl.link#5.sentence-5) Now the wording clearly says that linkage-specification applies to variables with external linkage. [CWG1818](https://cplusplus.github.io/CWG/issues/1818.html) "Visibility and inherited language linkage" ------------ [P1787R6](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p1787r6.html): > [CWG386](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#386), [CWG1839](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1839), [CWG1818](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1818), [CWG2058](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2058), [CWG1900](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1900), and Richard’s observation in [“are non-type names ignored in a class-head-name or enum-head-name?â€](http://lists.isocpp.org/core/2017/01/1604.php) are resolved by describing the limited lookup that occurs for a declarator-id, including the changes in Richard’s [proposed resolution for CWG1839](http://wiki.edg.com/pub/Wg21cologne2019/CoreWorkingGroup/cwg1839.html) (which also resolves CWG1818 and what of CWG2058 was not resolved along with CWG2059) and rejecting the example from [CWG1477](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1477). Wording ([[dcl.link]/6](https://eel.is/c++draft/dcl.link#6)): > A redeclaration of an entity without a linkage specification inherits the language linkage of the entity and (if applicable) its type[.](https://eel.is/c++draft/dcl.link#6.sentence-2). Answer to the question in the example is `extern "C"`, and not linkage mismatch. 
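The two resolutions combine into one small, self-contained program; the following condenses what the new cwg1818.cpp and cwg563.cpp tests exercise and is illustrative only, not part of the patch:

extern "C" void f() {
  void g(); // block-scope declaration; its target scope is the enclosing
            // namespace, and it picks up f's C language linkage
}

void g() {} // redeclaration without a linkage-specification: per
            // _N4988_.[dcl.link]/6 it inherits C language linkage, so
            // this defines the same entity as the declaration above

extern "C" int a; // CWG563: the linkage-specification also applies to
                  // variables whose names have external linkage
int a = 0;        // same variable; the C language linkage is inherited

int main() {
  f();
  g();
  return a;
}

Both tests are therefore expected-no-diagnostics; the CHECK lines below only verify, via the AST dump, that the later declarations are matched against the ones inside the extern "C" specifications.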
Further analysis of the example is provided as inline comments in the test itself. Note that https://eel.is/c++draft/dcl.link#7 does NOT apply in this example, as it's focused squarely at declarations that are already known to have C language linkage, and declarations of variables in the global scope. --- clang/test/CXX/drs/cwg1818.cpp | 34 ++++++++++++++++++++++++++++++++++ clang/test/CXX/drs/cwg18xx.cpp | 2 ++ clang/test/CXX/drs/cwg563.cpp | 16 ++++++++++++++++ clang/test/CXX/drs/cwg5xx.cpp | 1 + clang/www/cxx_dr_status.html | 4 ++-- 5 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 clang/test/CXX/drs/cwg1818.cpp create mode 100644 clang/test/CXX/drs/cwg563.cpp diff --git a/clang/test/CXX/drs/cwg1818.cpp b/clang/test/CXX/drs/cwg1818.cpp new file mode 100644 index 0000000000000..bf2d12696a729 --- /dev/null +++ b/clang/test/CXX/drs/cwg1818.cpp @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++11 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++14 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++17 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++23 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++2c %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s + +// expected-no-diagnostics + +namespace cwg1818 { // cwg1818: 3.4 +extern "C" void f() { + // This declaration binds name 'g' in the scope of function 'f', + // but its target scope corresponds to namespace 'cwg1818' (_N4988_.[dcl.meaning]/3.5). + // Linkage specification of 'f' applies to 'g' per _N4988_.[dcl.link]/5. + void g(); +} +// Target scope of this declaration is naturally the one +// that corresponds to namespace 'cwg1818', +// which makes it declare the same entity +// as the previous declaration per _N4988_.[basic.link]/8, +// turning it into a redeclaration per _N4988_.[basic.def]/1. +// Then _N4988_.[dcl.link]/6 applies, making it inherit +// the (С) language linkage of the previous declaration. +void g(); +} // namespace cwg1818 + +// Check that the former 'g' has C language linkage, +// then that the latter 'g' is considered to be a redeclaration of it, +// which would make the latter 'g' inherit C language linkage from the former 'g'. 
+ +// CHECK: LinkageSpecDecl [[LINKAGE_DECL:0x[0-9a-f]+]] {{.*}} C +// CHECK: FunctionDecl [[FIRST_DECL:0x[0-9a-f]+]] parent [[LINKAGE_DECL]] {{.*}} g 'void ()' +// CHECK: FunctionDecl {{.*}} prev [[FIRST_DECL]] {{.*}} g 'void ()' diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp index adfdb738e81c9..61b7faa96a9fb 100644 --- a/clang/test/CXX/drs/cwg18xx.cpp +++ b/clang/test/CXX/drs/cwg18xx.cpp @@ -222,6 +222,8 @@ namespace cwg1815 { // cwg1815: no #endif } +// cwg1818 is in cwg1818.cpp + namespace cwg1820 { // cwg1820: 3.5 typedef int A; typedef int cwg1820::A; diff --git a/clang/test/CXX/drs/cwg563.cpp b/clang/test/CXX/drs/cwg563.cpp new file mode 100644 index 0000000000000..d585fefc44ffc --- /dev/null +++ b/clang/test/CXX/drs/cwg563.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++11 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++14 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++17 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++23 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++2c %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s + +// expected-no-diagnostics + +namespace cwg563 { // cwg563: 3.3 +extern "C" int a; +} // namespace cwg563 + +// CHECK: LinkageSpecDecl {{.*}} C +// CHECK-NEXT: `-VarDecl {{.*}} a 'int' diff --git a/clang/test/CXX/drs/cwg5xx.cpp b/clang/test/CXX/drs/cwg5xx.cpp index 6a0bb7a196669..ed0c7159dfc88 100644 --- a/clang/test/CXX/drs/cwg5xx.cpp +++ b/clang/test/CXX/drs/cwg5xx.cpp @@ -799,6 +799,7 @@ namespace cwg561 { // cwg561: yes } // cwg562: na +// cwg563 is in cwg563.cpp namespace cwg564 { // cwg564: yes extern "C++" void f(int); diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index ca25776823cfa..aa79c3706f32b 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -3431,7 +3431,7 @@

C++ defect report implementation status

563 CD6 Linkage specification for objects - Unknown + Clang 3.3 564 @@ -10735,7 +10735,7 @@

C++ defect report implementation status

1818 CD6 Visibility and inherited language linkage - Unknown + Clang 3.4 1819 From b057e16740311b9c690c0c991c48b5087bf24d9a Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Tue, 3 Sep 2024 18:44:32 -0700 Subject: [PATCH 011/425] [IR] Remove unused MINARITY operand trait tpl args, NFC (#107165) These don't look like they've been used since the original 'use-diet' branch was merged in 2008 ( f6caff66a1bfa6464e6a17c0bcfcf06a09a9b909) --- llvm/include/llvm/Analysis/MemorySSA.h | 2 +- llvm/include/llvm/IR/Constants.h | 2 +- llvm/include/llvm/IR/Function.h | 3 +-- llvm/include/llvm/IR/InstrTypes.h | 4 ++-- llvm/include/llvm/IR/Instructions.h | 29 +++++++++----------------- llvm/include/llvm/IR/OperandTraits.h | 4 +--- llvm/include/llvm/IR/Operator.h | 4 ++-- llvm/include/llvm/IR/User.h | 1 - llvm/lib/IR/ConstantsContext.h | 2 +- 9 files changed, 19 insertions(+), 32 deletions(-) diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index 43fea6ba27ec4..c5eff151ca418 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -689,7 +689,7 @@ inline void MemoryUseOrDef::resetOptimized() { cast(this)->resetOptimized(); } -template <> struct OperandTraits : public HungoffOperandTraits<2> {}; +template <> struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryPhi, MemoryAccess) /// Encapsulates MemorySSA, including all data associated with memory diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index 2788751e8b62a..62ccde96e5397 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -1381,7 +1381,7 @@ class ConstantExpr : public Constant { template <> struct OperandTraits - : public VariadicOperandTraits {}; + : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantExpr, Constant) diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index 4abf978687d9d..ce48eed7883e6 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -1038,8 +1038,7 @@ class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject, /// Return value: true => null pointer dereference is not undefined. 
bool NullPointerIsDefined(const Function *F, unsigned AS = 0); -template <> -struct OperandTraits : public HungoffOperandTraits<3> {}; +template <> struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(Function, Value) diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index afae564bf022d..6fddedd86e97b 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -2407,7 +2407,7 @@ class CallBase : public Instruction { }; template <> -struct OperandTraits : public VariadicOperandTraits {}; +struct OperandTraits : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CallBase, Value) @@ -2474,7 +2474,7 @@ class FuncletPadInst : public Instruction { template <> struct OperandTraits - : public VariadicOperandTraits {}; + : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(FuncletPadInst, Value) diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index dbd7d49a3e767..6257d03458cab 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -1112,9 +1112,8 @@ class GetElementPtrInst : public Instruction { }; template <> -struct OperandTraits : - public VariadicOperandTraits { -}; +struct OperandTraits + : public VariadicOperandTraits {}; GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr, ArrayRef IdxList, unsigned Values, @@ -2723,9 +2722,7 @@ class PHINode : public Instruction { void growOperands(); }; -template <> -struct OperandTraits : public HungoffOperandTraits<2> { -}; +template <> struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(PHINode, Value) @@ -2825,8 +2822,7 @@ class LandingPadInst : public Instruction { }; template <> -struct OperandTraits : public HungoffOperandTraits<1> { -}; +struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(LandingPadInst, Value) @@ -2903,8 +2899,7 @@ class ReturnInst : public Instruction { }; template <> -struct OperandTraits : public VariadicOperandTraits { -}; +struct OperandTraits : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ReturnInst, Value) @@ -3039,8 +3034,7 @@ class BranchInst : public Instruction { }; template <> -struct OperandTraits : public VariadicOperandTraits { -}; +struct OperandTraits : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value) @@ -3428,9 +3422,7 @@ class SwitchInstProfUpdateWrapper { static CaseWeightOpt getSuccessorWeight(const SwitchInst &SI, unsigned idx); }; -template <> -struct OperandTraits : public HungoffOperandTraits<2> { -}; +template <> struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value) @@ -3554,8 +3546,7 @@ class IndirectBrInst : public Instruction { }; template <> -struct OperandTraits : public HungoffOperandTraits<1> { -}; +struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(IndirectBrInst, Value) @@ -4105,7 +4096,7 @@ class CatchSwitchInst : public Instruction { }; template <> -struct OperandTraits : public HungoffOperandTraits<2> {}; +struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CatchSwitchInst, Value) @@ -4337,7 +4328,7 @@ class CleanupReturnInst : public Instruction { template <> struct OperandTraits - : public VariadicOperandTraits {}; + : public VariadicOperandTraits {}; 
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CleanupReturnInst, Value) diff --git a/llvm/include/llvm/IR/OperandTraits.h b/llvm/include/llvm/IR/OperandTraits.h index ffece6324aab0..efc3d7838feaf 100644 --- a/llvm/include/llvm/IR/OperandTraits.h +++ b/llvm/include/llvm/IR/OperandTraits.h @@ -64,8 +64,7 @@ struct OptionalOperandTraits : public FixedNumOperandTraits { /// when it is a prefix to the User object, and the number of Use objects is /// only known at allocation time. -template -struct VariadicOperandTraits { +template struct VariadicOperandTraits { static Use *op_begin(SubClass* U) { static_assert( !std::is_polymorphic::value, @@ -91,7 +90,6 @@ struct VariadicOperandTraits { /// This is the traits class that is needed when the Use array must be /// resizable. -template struct HungoffOperandTraits { static Use *op_begin(User* U) { return U->getHungOffOperands(); diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h index f63f54ef94107..88b9bfc0be4b1 100644 --- a/llvm/include/llvm/IR/Operator.h +++ b/llvm/include/llvm/IR/Operator.h @@ -533,8 +533,8 @@ class GEPOperator }; template <> -struct OperandTraits - : public VariadicOperandTraits {}; +struct OperandTraits : public VariadicOperandTraits { +}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GEPOperator, Value) diff --git a/llvm/include/llvm/IR/User.h b/llvm/include/llvm/IR/User.h index a9cf60151e5dc..910815f236abe 100644 --- a/llvm/include/llvm/IR/User.h +++ b/llvm/include/llvm/IR/User.h @@ -42,7 +42,6 @@ template struct OperandTraits; class User : public Value { - template friend struct HungoffOperandTraits; LLVM_ATTRIBUTE_ALWAYS_INLINE static void * diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index acf05379e8a2a..bd19ec6b9dcac 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -246,7 +246,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ShuffleVectorConstantExpr, Value) template <> struct OperandTraits - : public VariadicOperandTraits {}; + : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GetElementPtrConstantExpr, Value) From ed220e15718498d0f854f1044ddcbfee00739aa7 Mon Sep 17 00:00:00 2001 From: Elvis Wang <110374989+ElvisWang123@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:46:02 +0800 Subject: [PATCH 012/425] [VPlan][NFC] Implement `VPWidenMemoryRecipe::computeCost()`. (#105614) In this patch, we implement the `computeCost()` function in `VPWidenMemoryRecipe`. --- llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 46 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index bd71dbffa929e..9ad98a5371d81 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2539,6 +2539,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase { llvm_unreachable("VPWidenMemoryRecipe should not be instantiated."); } + /// Return the cost of this VPWidenMemoryRecipe. 
+ InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + Instruction &getIngredient() const { return Ingredient; } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 69c76edd0f554..49ed733107da9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -272,6 +272,12 @@ static Instruction *getInstructionForCost(const VPRecipeBase *R) { return dyn_cast_or_null(S->getUnderlyingValue()); if (auto *IG = dyn_cast(R)) return IG->getInsertPos(); + // Currently the legacy cost model only calculates the instruction cost with + // underlying instruction. Removing the WidenMem here will prevent + // force-target-instruction-cost overwriting the cost of recipe with + // underlying instruction which is inconsistent with the legacy model. + // TODO: Remove WidenMem from this function when we don't need to compare to + // the legacy model. if (auto *WidenMem = dyn_cast(R)) return &WidenMem->getIngredient(); return nullptr; @@ -2132,6 +2138,46 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); + const Align Alignment = + getLoadStoreAlignment(const_cast(&Ingredient)); + unsigned AS = + getLoadStoreAddressSpace(const_cast(&Ingredient)); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + if (!Consecutive) { + // TODO: Using the original IR may not be accurate. + // Currently, ARM will use the underlying IR to calculate gather/scatter + // instruction cost. + const Value *Ptr = getLoadStorePointerOperand(&Ingredient); + assert(!Reverse && + "Inconsecutive memory access should not have the order."); + return Ctx.TTI.getAddressComputationCost(Ty) + + Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, + IsMasked, Alignment, CostKind, + &Ingredient); + } + + InstructionCost Cost = 0; + if (IsMasked) { + Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, + AS, CostKind); + } else { + TTI::OperandValueInfo OpInfo = + Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); + Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, + CostKind, OpInfo, &Ingredient); + } + if (!Reverse) + return Cost; + + return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, + cast(Ty), std::nullopt, + CostKind, 0); +} + void VPWidenLoadRecipe::execute(VPTransformState &State) { auto *LI = cast(&Ingredient); From 9b5971ad0355d43a9bd37b1067d93ff8b08eba81 Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Tue, 3 Sep 2024 18:55:23 -0700 Subject: [PATCH 013/425] [AArch64][GlobalISel] Lower G_BUILD_VECTOR to G_INSERT_VECTOR_ELT (#105686) The lowering happens in post-legalizer lowering if any source registers from G_BUILD_VECTOR are not constants. 
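As a rough sketch (simplified from the updated post-legalizer lowering MIR
tests; register names here are illustrative), a build vector with
non-constant sources such as

    %v:_(<2 x s32>) = G_BUILD_VECTOR %a(s32), %b(s32)

is rewritten into a chain of inserts into an undef vector:

    %undef:_(<2 x s32>) = G_IMPLICIT_DEF
    %zero:_(s64) = G_CONSTANT i64 0
    %ins0:_(<2 x s32>) = G_INSERT_VECTOR_ELT %undef, %a(s32), %zero(s64)
    %one:_(s64) = G_CONSTANT i64 1
    %v:_(<2 x s32>) = G_INSERT_VECTOR_ELT %ins0, %b(s32), %one(s64)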
Add a pattern fragment setting `scalar_to_vector ($src)` as equivalent to
`vector_insert (undef), ($src), (i64 0)`
---
 llvm/lib/Target/AArch64/AArch64Combine.td | 10 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 73 +-
 .../GISel/AArch64InstructionSelector.cpp | 15 +
 .../GISel/AArch64PostLegalizerLowering.cpp | 34 +
 .../legalize-shuffle-vector-widen-crash.ll | 12 +-
 ...legalizer-lowering-build-vector-to-dup.mir | 24 +-
 .../postlegalizer-lowering-shuffle-splat.mir | 51 +-
 llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll | 2 +-
 llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll | 2 +-
 llvm/test/CodeGen/AArch64/aarch64-smull.ll | 116 ++--
 llvm/test/CodeGen/AArch64/abs.ll | 8 +-
 llvm/test/CodeGen/AArch64/add.ll | 100 ++-
 llvm/test/CodeGen/AArch64/andorxor.ll | 300 ++++-----
 llvm/test/CodeGen/AArch64/arm64-dup.ll | 61 +-
 .../AArch64/arm64-extract-insert-varidx.ll | 45 +-
 .../AArch64/arm64-indexed-vector-ldst.ll | 82 +--
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 318 ++++-----
 .../CodeGen/AArch64/arm64-subvector-extend.ll | 372 ++++-------
 llvm/test/CodeGen/AArch64/arm64-tbl.ll | 159 +++--
 llvm/test/CodeGen/AArch64/bitcast.ll | 106 +--
 llvm/test/CodeGen/AArch64/bswap.ll | 6 +-
 llvm/test/CodeGen/AArch64/concat-vector.ll | 178 +++--
 llvm/test/CodeGen/AArch64/fabs.ll | 32 +-
 llvm/test/CodeGen/AArch64/faddsub.ll | 80 +--
 llvm/test/CodeGen/AArch64/fcmp.ll | 334 +++++----
 llvm/test/CodeGen/AArch64/fcopysign.ll | 34 +-
 llvm/test/CodeGen/AArch64/fcvt.ll | 224 +++----
 llvm/test/CodeGen/AArch64/fdiv.ll | 40 +-
 llvm/test/CodeGen/AArch64/fexplog.ll | 315 ++++-----
 .../AArch64/fixed-vector-interleave.ll | 14 +-
 llvm/test/CodeGen/AArch64/fminimummaximum.ll | 80 +--
 llvm/test/CodeGen/AArch64/fminmax.ll | 80 +--
 llvm/test/CodeGen/AArch64/fmla.ll | 168 ++---
 llvm/test/CodeGen/AArch64/fmul.ll | 40 +-
 llvm/test/CodeGen/AArch64/fneg.ll | 32 +-
 llvm/test/CodeGen/AArch64/fpow.ll | 12 +-
 llvm/test/CodeGen/AArch64/fpowi.ll | 12 +-
 llvm/test/CodeGen/AArch64/fptoi.ll | 70 +-
 llvm/test/CodeGen/AArch64/fptrunc.ll | 12 +-
 llvm/test/CodeGen/AArch64/frem.ll | 12 +-
 llvm/test/CodeGen/AArch64/fsincos.ll | 126 ++--
 llvm/test/CodeGen/AArch64/fsqrt.ll | 32 +-
 llvm/test/CodeGen/AArch64/icmp.ll | 16 +-
 llvm/test/CodeGen/AArch64/insertextract.ll | 47 +-
 llvm/test/CodeGen/AArch64/itofp.ll | 180 +++--
 llvm/test/CodeGen/AArch64/llvm.exp10.ll | 10 +-
 llvm/test/CodeGen/AArch64/load.ll | 48 +-
 llvm/test/CodeGen/AArch64/mul.ll | 134 ++--
 .../AArch64/neon-bitwise-instructions.ll | 34 +-
 .../AArch64/neon-compare-instructions.ll | 12 +-
 llvm/test/CodeGen/AArch64/neon-extadd.ll | 358 +++++-----
 llvm/test/CodeGen/AArch64/neon-extmul.ll | 28 +-
 llvm/test/CodeGen/AArch64/neon-perm.ll | 13 +-
 llvm/test/CodeGen/AArch64/ptradd.ll | 52 +-
 llvm/test/CodeGen/AArch64/rem.ll | 632 +++++++++---------
 llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 38 +-
 llvm/test/CodeGen/AArch64/sext.ll | 206 +++---
 llvm/test/CodeGen/AArch64/shift.ll | 177 ++---
 llvm/test/CodeGen/AArch64/shufflevector.ll | 70 +-
 llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 38 +-
 llvm/test/CodeGen/AArch64/sub.ll | 100 ++-
 llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 38 +-
 llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 38 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll | 528 ++++++---------
 llvm/test/CodeGen/AArch64/xtn.ll | 77 ++-
 llvm/test/CodeGen/AArch64/zext.ll | 150 ++---
 66 files changed, 3200 insertions(+), 3647 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index d12f834da5a15..f99d1e276c60f 100644
---
a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -222,7 +222,15 @@ def build_vector_to_dup : GICombineRule< (apply [{ applyBuildVectorToDup(*${root}, MRI, B); }]) >; -def build_vector_lowering : GICombineGroup<[build_vector_to_dup]>; +def build_vector_to_vector_insert : GICombineRule< + (defs root:$root, register_matchinfo:$matchinfo), + (match (G_BUILD_VECTOR $dst, GIVariadic<>:$unused):$root, + [{ return matchLowerBuildToInsertVecElt(*${root}, MRI); }]), + (apply [{ applyLowerBuildToInsertVecElt(*${root}, MRI, B); }]) +>; + +def build_vector_lowering : GICombineGroup<[build_vector_to_dup, + build_vector_to_vector_insert]>; def lower_vector_fcmp : GICombineRule< (defs root:$root), diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 2fff6fffcd7c6..c659697c3a1be 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3315,6 +3315,10 @@ defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>; // Pre-fetch. defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">; +def vec_ins_or_scal_vec : PatFrags<(ops node:$src), + [(vector_insert undef, node:$src, (i64 0)), + (scalar_to_vector node:$src)]>; + // For regular load, we do not have any alignment requirement. // Thus, it is safe to directly map the vector loads with interesting // addressing modes. @@ -3323,13 +3327,13 @@ multiclass ScalToVecROLoadPat { - def : Pat<(VecTy (scalar_to_vector (ScalTy + def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))), (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset), sub)>; - def : Pat<(VecTy (scalar_to_vector (ScalTy + def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))), (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset), @@ -3357,12 +3361,12 @@ defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; -def : Pat <(v1i64 (scalar_to_vector (i64 +def : Pat <(v1i64 (vec_ins_or_scal_vec (i64 (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend))))), (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; -def : Pat <(v1i64 (scalar_to_vector (i64 +def : Pat <(v1i64 (vec_ins_or_scal_vec (i64 (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend))))), (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; @@ -3495,34 +3499,34 @@ def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), // Thus, it is safe to directly map the vector loads with interesting // addressing modes. // FIXME: We could do the same for bitconvert to floating point vectors. 
-def : Pat <(v8i8 (scalar_to_vector (i32 +def : Pat <(v8i8 (vec_ins_or_scal_vec (i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -def : Pat <(v16i8 (scalar_to_vector (i32 +def : Pat <(v16i8 (vec_ins_or_scal_vec (i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -def : Pat <(v4i16 (scalar_to_vector (i32 +def : Pat <(v4i16 (vec_ins_or_scal_vec (i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -def : Pat <(v8i16 (scalar_to_vector (i32 +def : Pat <(v8i16 (vec_ins_or_scal_vec (i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -def : Pat <(v2i32 (scalar_to_vector (i32 +def : Pat <(v2i32 (vec_ins_or_scal_vec (i32 (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -def : Pat <(v4i32 (scalar_to_vector (i32 +def : Pat <(v4i32 (vec_ins_or_scal_vec (i32 (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -def : Pat <(v1i64 (scalar_to_vector (i64 +def : Pat <(v1i64 (vec_ins_or_scal_vec (i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat <(v2i64 (scalar_to_vector (i64 +def : Pat <(v2i64 (vec_ins_or_scal_vec (i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>; @@ -6848,10 +6852,10 @@ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn), defm INS : SIMDIns; -def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v16i8 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v8i8 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; @@ -6859,50 +6863,49 @@ def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), def : Pat<(v8i8 (bitconvert (i64 (zext GPR32:$Rn)))), (SUBREG_TO_REG (i32 0), (f32 (FMOVWSr GPR32:$Rn)), ssub)>; -def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v8i16 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v4i16 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v2i32 (scalar_to_vector (i32 
FPR32:$Rn))), +def : Pat<(v2i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))), (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; -def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), +def : Pat<(v4i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))), (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; - -def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), +def : Pat<(v2i64 (vec_ins_or_scal_vec (i64 FPR64:$Rn))), (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (i64 FPR64:$Rn), dsub))>; -def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), +def : Pat<(v4f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), +def : Pat<(v2f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))), (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), +def : Pat<(v2f64 (vec_ins_or_scal_vec (f64 FPR64:$Rn))), (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), @@ -8507,7 +8510,7 @@ def : Ld1Lane64IdxOpPat - : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))), + : Pat<(ResultTy (vec_ins_or_scal_vec (i32 (ExtLoad GPR64sp:$Rn)))), (ResultTy (EXTRACT_SUBREG (LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>; @@ -8940,11 +8943,11 @@ def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), +def : Pat<(v1i64 (vec_ins_or_scal_vec GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), +def : Pat<(v1f64 (vec_ins_or_scal_vec GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; +def : Pat<(v1f64 (vec_ins_or_scal_vec (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index e9e6b6cb68d0d..18361cf368564 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -2116,6 +2116,21 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { I.getOperand(1).setReg(NewSrc.getReg(0)); return true; } + case AArch64::G_INSERT_VECTOR_ELT: { + // Convert the type from p0 to s64 to help selection. 
+ LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + LLT SrcVecTy = MRI.getType(I.getOperand(1).getReg()); + if (!SrcVecTy.isPointerVector()) + return false; + auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(2).getReg()); + MRI.setType(I.getOperand(1).getReg(), + DstTy.changeElementType(LLT::scalar(64))); + MRI.setType(I.getOperand(0).getReg(), + DstTy.changeElementType(LLT::scalar(64))); + MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass); + I.getOperand(2).setReg(NewSrc.getReg(0)); + return true; + } case TargetOpcode::G_UITOFP: case TargetOpcode::G_SITOFP: { // If both source and destination regbanks are FPR, then convert the opcode diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 90ac4bdff4e0e..b40fe55fdfaf6 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -1054,6 +1054,40 @@ void applyLowerVectorFCMP(MachineInstr &MI, MachineRegisterInfo &MRI, MI.eraseFromParent(); } +// Matches G_BUILD_VECTOR where at least one source operand is not a constant +bool matchLowerBuildToInsertVecElt(MachineInstr &MI, MachineRegisterInfo &MRI) { + auto *GBuildVec = cast(&MI); + + // Check if the values are all constants + for (unsigned I = 0; I < GBuildVec->getNumSources(); ++I) { + auto ConstVal = + getAnyConstantVRegValWithLookThrough(GBuildVec->getSourceReg(I), MRI); + + if (!ConstVal.has_value()) + return true; + } + + return false; +} + +void applyLowerBuildToInsertVecElt(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) { + auto *GBuildVec = cast(&MI); + LLT DstTy = MRI.getType(GBuildVec->getReg(0)); + Register DstReg = B.buildUndef(DstTy).getReg(0); + + for (unsigned I = 0; I < GBuildVec->getNumSources(); ++I) { + Register SrcReg = GBuildVec->getSourceReg(I); + if (mi_match(SrcReg, MRI, m_GImplicitDef())) + continue; + auto IdxReg = B.buildConstant(LLT::scalar(64), I); + DstReg = + B.buildInsertVectorElement(DstTy, DstReg, SrcReg, IdxReg).getReg(0); + } + B.buildCopy(GBuildVec->getReg(0), DstReg); + GBuildVec->eraseFromParent(); +} + bool matchFormTruncstore(MachineInstr &MI, MachineRegisterInfo &MRI, Register &SrcReg) { assert(MI.getOpcode() == TargetOpcode::G_STORE); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll index f7efaeaa50705..87c1307ad2955 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll @@ -10,12 +10,14 @@ define i32 @bar() { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: mov b1, v0[1] -; CHECK-NEXT: mov b2, v0[2] -; CHECK-NEXT: mov b3, v0[3] -; CHECK-NEXT: mov.h v0[1], v1[0] -; CHECK-NEXT: mov.h v2[1], v3[0] +; CHECK-NEXT: mov b2, v0[3] +; CHECK-NEXT: mov b3, v0[2] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov.h v0[1], w8 +; CHECK-NEXT: mov.h v3[1], w9 ; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: ushll.4s v1, v2, #0 +; CHECK-NEXT: ushll.4s v1, v3, #0 ; CHECK-NEXT: mov.d v0[1], v1[0] ; CHECK-NEXT: movi.4s v1, #1 ; CHECK-NEXT: and.16b v0, v0, v1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir index 
70867c2ea2842..0115531dfb09a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir @@ -42,20 +42,30 @@ body: | ; LOWER-NEXT: {{ $}} ; LOWER-NEXT: %r:_(s32) = COPY $w0 ; LOWER-NEXT: %q:_(s32) = COPY $w1 - ; LOWER-NEXT: %build_vector:_(<2 x s32>) = G_BUILD_VECTOR %r(s32), %q(s32) + ; LOWER-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF + ; LOWER-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; LOWER-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %r(s32), [[C]](s64) + ; LOWER-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; LOWER-NEXT: [[IVEC1:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %q(s32), [[C1]](s64) + ; LOWER-NEXT: %build_vector:_(<2 x s32>) = COPY [[IVEC1]](<2 x s32>) ; LOWER-NEXT: $d0 = COPY %build_vector(<2 x s32>) ; LOWER-NEXT: RET_ReallyLR implicit $d0 ; ; SELECT-LABEL: name: dont_combine_different_reg ; SELECT: liveins: $d0, $w0, $w1 ; SELECT-NEXT: {{ $}} - ; SELECT-NEXT: %r:gpr32all = COPY $w0 + ; SELECT-NEXT: %r:gpr32 = COPY $w0 ; SELECT-NEXT: %q:gpr32 = COPY $w1 - ; SELECT-NEXT: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF - ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], %r, %subreg.ssub - ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, %q - ; SELECT-NEXT: %build_vector:fpr64 = COPY [[INSvi32gpr]].dsub - ; SELECT-NEXT: $d0 = COPY %build_vector + ; SELECT-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF + ; SELECT-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[DEF]], %subreg.dsub + ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 0, %r + ; SELECT-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr]].dsub + ; SELECT-NEXT: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; SELECT-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY]], %subreg.dsub + ; SELECT-NEXT: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG1]], 1, %q + ; SELECT-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr1]].dsub + ; SELECT-NEXT: $d0 = COPY [[COPY1]] ; SELECT-NEXT: RET_ReallyLR implicit $d0 %r:_(s32) = COPY $w0 %q:_(s32) = COPY $w1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir index 71094825e42f3..7c7689bcb80b5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir @@ -355,7 +355,21 @@ body: | ; CHECK: liveins: $w0, $w1, $w2, $w3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lane:_(s32) = COPY $w0 - ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_DUP %lane(s32) + ; CHECK-NEXT: %b:_(s32) = COPY $w1 + ; CHECK-NEXT: %c:_(s32) = COPY $w2 + ; CHECK-NEXT: %d:_(s32) = COPY $w3 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %lane(s32), [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %b(s32), [[C1]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], %c(s32), [[C2]](s64) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 
+ ; CHECK-NEXT: [[IVEC3:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC2]], %d(s32), [[C3]](s64) + ; CHECK-NEXT: %buildvec:_(<4 x s32>) = COPY [[IVEC3]](<4 x s32>) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_DUPLANE32 %buildvec, [[C4]](s64) ; CHECK-NEXT: $q0 = COPY %shuf(<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lane:_(s32) = COPY $w0 @@ -367,7 +381,7 @@ body: | %shuf:_(<4 x s32>) = G_SHUFFLE_VECTOR %buildvec(<4 x s32>), %undef, shufflemask(0, 0, 0, 0) $q0 = COPY %shuf(<4 x s32>) RET_ReallyLR implicit $q0 - + ... --- name: build_vector_rhs @@ -382,10 +396,35 @@ body: | ; ; CHECK-LABEL: name: build_vector ; CHECK: liveins: $w0, $w1, $w2, $w3, $w4 - ; CHECK: %lane_1:_(s32) = COPY $w1 - ; CHECK: %shuf:_(<4 x s32>) = G_DUP %lane_1(s32) - ; CHECK: $q0 = COPY %shuf(<4 x s32>) - ; CHECK: RET_ReallyLR implicit $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %lane_0:_(s32) = COPY $w0 + ; CHECK-NEXT: %lane_1:_(s32) = COPY $w1 + ; CHECK-NEXT: %b:_(s32) = COPY $w2 + ; CHECK-NEXT: %c:_(s32) = COPY $w3 + ; CHECK-NEXT: %d:_(s32) = COPY $w4 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %lane_0(s32), [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %b(s32), [[C1]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], %c(s32), [[C2]](s64) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[IVEC3:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC2]], %d(s32), [[C3]](s64) + ; CHECK-NEXT: %buildvec0:_(<4 x s32>) = COPY [[IVEC3]](<4 x s32>) + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC4:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF1]], %lane_1(s32), [[C4]](s64) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[IVEC5:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC4]], %b(s32), [[C5]](s64) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[IVEC6:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC5]], %c(s32), [[C6]](s64) + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[IVEC7:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC6]], %d(s32), [[C7]](s64) + ; CHECK-NEXT: %buildvec1:_(<4 x s32>) = COPY [[IVEC7]](<4 x s32>) + ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_SHUFFLE_VECTOR %buildvec0(<4 x s32>), %buildvec1, shufflemask(4, 4, 4, 4) + ; CHECK-NEXT: $q0 = COPY %shuf(<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lane_0:_(s32) = COPY $w0 %lane_1:_(s32) = COPY $w1 %b:_(s32) = COPY $w2 diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll index f47da47002fbc..9734ab35bd6b2 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll @@ -76,7 +76,7 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { ; CHECK-GI-NEXT: bic w9, w9, w8 ; CHECK-GI-NEXT: and w8, w8, w10 ; CHECK-GI-NEXT: orr w8, w9, w8 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %neg = xor <1 x i32> %C, diff --git 
a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll index 6431cfc58a54d..45ad4b07ff66f 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll @@ -76,7 +76,7 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { ; CHECK-GI-NEXT: and w9, w8, w9 ; CHECK-GI-NEXT: bic w8, w10, w8 ; CHECK-GI-NEXT: orr w8, w9, w8 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %and = and <1 x i32> %C, %B diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 307aa397eabbb..d677526bab000 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -209,24 +209,22 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { ; CHECK-GI-NEXT: ldr w8, [x0] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: uxtb w8, w8 -; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b1, v0.b[2] +; CHECK-GI-NEXT: mov b2, v0.b[1] ; CHECK-GI-NEXT: mov b3, v0.b[3] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: fmov w10, s2 ; CHECK-GI-NEXT: fmov w11, s3 +; CHECK-GI-NEXT: ldr d2, [x1] ; CHECK-GI-NEXT: uxtb w9, w9 ; CHECK-GI-NEXT: uxtb w10, w10 ; CHECK-GI-NEXT: uxtb w11, w11 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 -; CHECK-GI-NEXT: ldr d2, [x1] +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 ; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s @@ -269,25 +267,25 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; ; CHECK-GI-LABEL: smull_zext_v2i32_v2i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr h1, [x0] +; CHECK-GI-NEXT: ld1 { v1.h }[0], [x0] ; CHECK-GI-NEXT: ldr h2, [x0, #2] ; CHECK-GI-NEXT: movi d0, #0x00ffff0000ffff ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: ldr d1, [x1] +; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-GI-NEXT: mov w8, v0.s[0] ; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: ldr d0, [x1] -; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: fmov x11, d0 -; CHECK-GI-NEXT: mov v1.d[1], x9 -; CHECK-GI-NEXT: mov x9, v0.d[1] -; CHECK-GI-NEXT: fmov x10, d1 -; CHECK-GI-NEXT: mov x8, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mov x11, v1.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: mov x10, v0.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret %load.A = load <2 x i16>, ptr %A %load.B = load <2 x i32>, ptr %B @@ -322,14 +320,14 @@ define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: fmov x10, d0 -; 
CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x11, v1.d[1] +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: mov x10, v0.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret %load.A = load <2 x i32>, ptr %A %and.A = and <2 x i32> %load.A, @@ -1048,14 +1046,14 @@ define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-NEXT: adrp x8, .LCPI36_0 ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_0] -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v1.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret %tmp3 = sext <2 x i32> %arg to <2 x i64> %tmp4 = mul <2 x i64> %tmp3, @@ -1163,14 +1161,14 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-NEXT: adrp x8, .LCPI40_0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0] -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v1.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret %tmp3 = zext <2 x i32> %arg to <2 x i64> %tmp4 = mul <2 x i64> %tmp3, @@ -1264,15 +1262,15 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-NEXT: adrp x8, .LCPI43_0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0] -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v1.d[1] ; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: ret %tmp3 = zext <2 x i32> %arg to <2 x i64> @@ -1891,15 +1889,15 @@ define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2d, #0x000000000000ff ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: mov x10, v0.d[1] ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x11, v1.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; 
CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret entry: %in1 = zext <2 x i32> %src1 to <2 x i64> @@ -1947,10 +1945,10 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) { ; CHECK-GI-NEXT: fmov x9, d0 ; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x9, x9, x12 -; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: mov v0.d[0], x8 ; CHECK-GI-NEXT: mul x11, x13, x14 +; CHECK-GI-NEXT: mov v1.d[0], x9 ; CHECK-GI-NEXT: mov v0.d[1], x10 -; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: mov v1.d[1], x11 ; CHECK-GI-NEXT: ret entry: @@ -1992,9 +1990,9 @@ define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) { ; CHECK-GI-NEXT: mul x8, x8, x9 ; CHECK-GI-NEXT: mul x9, x12, x9 ; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: mov v0.d[0], x8 ; CHECK-GI-NEXT: mul x11, x13, x11 -; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v1.d[0], x9 ; CHECK-GI-NEXT: mov v0.d[1], x10 ; CHECK-GI-NEXT: mov v1.d[1], x11 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll index 78c1ff7b99370..6da019a79b727 100644 --- a/llvm/test/CodeGen/AArch64/abs.ll +++ b/llvm/test/CodeGen/AArch64/abs.ll @@ -247,7 +247,7 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){ ; CHECK-GI-NEXT: fmov w9, s0 ; CHECK-GI-NEXT: cmp w8, #0 ; CHECK-GI-NEXT: cneg w8, w9, le -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -299,10 +299,8 @@ define <3 x i8> @abs_v3i8(<3 x i8> %a){ ; CHECK-GI-LABEL: abs_v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w2 ; CHECK-GI-NEXT: abs v0.8b, v0.8b ; CHECK-GI-NEXT: umov w0, v0.b[0] ; CHECK-GI-NEXT: umov w1, v0.b[1] diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index ee15445a7bbd6..fc1a0c71d4cdf 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -71,13 +71,13 @@ define void @v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: add v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -112,22 +112,18 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: 
fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -159,27 +155,27 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -247,13 +243,13 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -281,18 +277,16 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; 
CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 1176c98ce44e3..5385a917619fa 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -191,13 +191,13 @@ define void @and_v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: and_v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -228,13 +228,13 @@ define void @or_v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: or_v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -265,13 +265,13 @@ define void @xor_v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: xor_v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -306,22 +306,18 @@ define void @and_v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: and_v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; 
CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -358,22 +354,18 @@ define void @or_v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: or_v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -410,22 +402,18 @@ define void @xor_v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: xor_v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -459,27 +447,27 @@ define void @and_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; 
CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -510,27 +498,27 @@ define void @or_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -561,27 +549,27 @@ define void @xor_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; 
CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -723,13 +711,13 @@ define void @and_v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: and_v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -762,13 +750,13 @@ define void @or_v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: or_v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -801,13 +789,13 @@ define void @xor_v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: xor_v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -836,18 +824,16 @@ define void @and_v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: and_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; 
CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] @@ -875,18 +861,16 @@ define void @or_v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: or_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] @@ -914,18 +898,16 @@ define void @xor_v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: xor_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll index 0291f8c912304..a25763e3b1590 100644 --- a/llvm/test/CodeGen/AArch64/arm64-dup.ll +++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll @@ -334,25 +334,40 @@ entry: } define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone { -; CHECK-LABEL: f: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: mov.s v0[1], w1 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: f: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov.s v0[1], w1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: f: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov.s v0[0], w0 +; CHECK-GI-NEXT: mov.s v0[1], w1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0 %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1 ret <2 x i32> %vecinit1 } define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone { -; CHECK-LABEL: g: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: mov.s v0[1], w1 -; CHECK-NEXT: mov.s v0[2], w1 -; CHECK-NEXT: mov.s v0[3], w0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: g: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov.s v0[1], w1 +; CHECK-SD-NEXT: mov.s v0[2], w1 +; CHECK-SD-NEXT: mov.s v0[3], w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: g: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov.s v0[0], w0 +; CHECK-GI-NEXT: mov.s v0[1], w1 +; CHECK-GI-NEXT: mov.s v0[2], w1 +; CHECK-GI-NEXT: mov.s v0[3], w0 +; CHECK-GI-NEXT: ret %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0 %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 @@ -361,11 +376,17 @@ define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone { } define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone { -; CHECK-LABEL: h: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: mov.d v0[1], x1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov d0, x0 +; CHECK-SD-NEXT: mov.d v0[1], x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov.d v0[0], x0 +; CHECK-GI-NEXT: mov.d v0[1], x1 +; CHECK-GI-NEXT: ret %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0 %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1 ret <2 x i64> %vecinit1 @@ -386,8 +407,8 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) { ; ; CHECK-GI-LABEL: test_build_illegal: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov s0, v0[3] -; CHECK-GI-NEXT: mov.h v0[3], v0[0] +; CHECK-GI-NEXT: mov.s w8, v0[3] +; CHECK-GI-NEXT: mov.h v0[3], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %val = extractelement <4 x i32> %in, i32 3 diff --git a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll index bc399c8d4ff07..8611532d6ea92 100644 --- a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll +++ b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll @@ -29,19 +29,20 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) { ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GISEL-NEXT: mov w9, w0 ; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GISEL-NEXT: mov b1, v0.b[1] ; CHECK-GISEL-NEXT: add x8, sp, #8 -; CHECK-GISEL-NEXT: and x9, x9, #0x7 ; CHECK-GISEL-NEXT: str d0, [sp, #8] +; CHECK-GISEL-NEXT: and x9, x9, #0x7 +; CHECK-GISEL-NEXT: mov b2, v0.b[1] ; CHECK-GISEL-NEXT: mov b3, v0.b[2] ; CHECK-GISEL-NEXT: lsl x10, x9, #1 ; CHECK-GISEL-NEXT: mov b0, v0.b[3] ; CHECK-GISEL-NEXT: sub x9, x10, x9 -; CHECK-GISEL-NEXT: ldr b2, [x8, x9] -; CHECK-GISEL-NEXT: mov v2.b[1], v1.b[0] -; CHECK-GISEL-NEXT: mov v2.b[2], v3.b[0] -; CHECK-GISEL-NEXT: mov v2.b[3], v0.b[0] -; CHECK-GISEL-NEXT: ushll v0.8h, v2.8b, #0 +; CHECK-GISEL-NEXT: ldr b1, [x8, x9] +; CHECK-GISEL-NEXT: mov v1.b[0], v1.b[0] +; CHECK-GISEL-NEXT: mov v1.b[1], v2.b[0] +; CHECK-GISEL-NEXT: mov v1.b[2], v3.b[0] +; CHECK-GISEL-NEXT: mov v1.b[3], v0.b[0] +; CHECK-GISEL-NEXT: ushll v0.8h, v1.8b, #0 ; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GISEL-NEXT: add sp, sp, #16 ; CHECK-GISEL-NEXT: ret @@ -82,14 +83,15 @@ define <8 x i8> @test_varidx_extract_v16s8(<16 x i8> %x, i32 %idx) { ; CHECK-GISEL-NEXT: sub sp, sp, #16 ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GISEL-NEXT: mov w9, w0 -; CHECK-GISEL-NEXT: mov b2, v0.b[1] ; CHECK-GISEL-NEXT: mov x8, sp -; CHECK-GISEL-NEXT: and x9, x9, #0xf ; CHECK-GISEL-NEXT: str q0, [sp] +; CHECK-GISEL-NEXT: and x9, x9, #0xf +; CHECK-GISEL-NEXT: mov b2, v0.b[1] ; CHECK-GISEL-NEXT: mov b3, v0.b[2] ; CHECK-GISEL-NEXT: lsl x10, x9, #1 ; CHECK-GISEL-NEXT: sub 
x9, x10, x9 ; CHECK-GISEL-NEXT: ldr b1, [x8, x9] +; CHECK-GISEL-NEXT: mov v1.b[0], v1.b[0] ; CHECK-GISEL-NEXT: mov v1.b[1], v2.b[0] ; CHECK-GISEL-NEXT: mov b2, v0.b[3] ; CHECK-GISEL-NEXT: mov v1.b[2], v3.b[0] @@ -176,15 +178,14 @@ define <2 x i16> @test_varidx_extract_v4s16(<4 x i16> %x, i32 %idx) { ; CHECK-GISEL: // %bb.0: ; CHECK-GISEL-NEXT: sub sp, sp, #16 ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GISEL-NEXT: mov w9, w0 -; CHECK-GISEL-NEXT: mov h1, v0.h[1] +; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GISEL-NEXT: add x8, sp, #8 ; CHECK-GISEL-NEXT: str d0, [sp, #8] ; CHECK-GISEL-NEXT: and x9, x9, #0x3 -; CHECK-GISEL-NEXT: ldr h0, [x8, x9, lsl #1] -; CHECK-GISEL-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GISEL-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GISEL-NEXT: ldr h1, [x8, x9, lsl #1] +; CHECK-GISEL-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GISEL-NEXT: ushll v0.4s, v1.4h, #0 ; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GISEL-NEXT: add sp, sp, #16 ; CHECK-GISEL-NEXT: ret @@ -217,16 +218,13 @@ define <4 x i16> @test_varidx_extract_v8s16(<8 x i16> %x, i32 %idx) { ; CHECK-GISEL-NEXT: sub sp, sp, #16 ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GISEL-NEXT: mov w9, w0 -; CHECK-GISEL-NEXT: mov h2, v0.h[1] ; CHECK-GISEL-NEXT: mov x8, sp ; CHECK-GISEL-NEXT: str q0, [sp] ; CHECK-GISEL-NEXT: and x9, x9, #0x7 -; CHECK-GISEL-NEXT: mov h3, v0.h[2] ; CHECK-GISEL-NEXT: ldr h1, [x8, x9, lsl #1] -; CHECK-GISEL-NEXT: mov h0, v0.h[3] -; CHECK-GISEL-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GISEL-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GISEL-NEXT: mov v1.h[3], v0.h[0] +; CHECK-GISEL-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GISEL-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GISEL-NEXT: mov v1.h[3], v0.h[3] ; CHECK-GISEL-NEXT: fmov d0, d1 ; CHECK-GISEL-NEXT: add sp, sp, #16 ; CHECK-GISEL-NEXT: ret @@ -289,13 +287,12 @@ define <2 x i32> @test_varidx_extract_v4s32(<4 x i32> %x, i32 %idx) { ; CHECK-GISEL-NEXT: sub sp, sp, #16 ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GISEL-NEXT: mov w9, w0 -; CHECK-GISEL-NEXT: mov s1, v0.s[1] ; CHECK-GISEL-NEXT: mov x8, sp ; CHECK-GISEL-NEXT: str q0, [sp] ; CHECK-GISEL-NEXT: and x9, x9, #0x3 -; CHECK-GISEL-NEXT: ldr s0, [x8, x9, lsl #2] -; CHECK-GISEL-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: ldr s1, [x8, x9, lsl #2] +; CHECK-GISEL-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GISEL-NEXT: fmov d0, d1 ; CHECK-GISEL-NEXT: add sp, sp, #16 ; CHECK-GISEL-NEXT: ret %tmp = extractelement <4 x i32> %x, i32 %idx diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll index 720951eca6a34..0412aef7545e9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -13820,12 +13820,10 @@ define void @test_ld1lane_build(ptr %ptr0, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; CHECK-GI-LABEL: test_ld1lane_build: ; CHECK-GI: ; %bb.0: ; CHECK-GI-NEXT: ldr s0, [x0] -; CHECK-GI-NEXT: ldr s1, [x1] -; CHECK-GI-NEXT: ldr s2, [x2] -; CHECK-GI-NEXT: ldr s3, [x3] -; CHECK-GI-NEXT: mov.s v0[1], v1[0] -; CHECK-GI-NEXT: mov.s v2[1], v3[0] -; CHECK-GI-NEXT: sub.2s v0, v0, v2 +; CHECK-GI-NEXT: ldr s1, [x2] +; CHECK-GI-NEXT: ld1.s { v0 }[1], [x1] +; CHECK-GI-NEXT: ld1.s { v1 }[1], [x3] +; CHECK-GI-NEXT: sub.2s v0, v0, v1 ; CHECK-GI-NEXT: str d0, [x4] ; CHECK-GI-NEXT: ret %load0 = load i32, ptr %ptr0, align 4 @@ -13844,28 +13842,15 @@ define 
void @test_ld1lane_build(ptr %ptr0, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr } define void @test_ld1lane_build_i16(ptr %a, ptr %b, ptr %c, ptr %d, <4 x i16> %e, ptr %p) { -; CHECK-SD-LABEL: test_ld1lane_build_i16: -; CHECK-SD: ; %bb.0: -; CHECK-SD-NEXT: ldr h1, [x0] -; CHECK-SD-NEXT: ld1.h { v1 }[1], [x1] -; CHECK-SD-NEXT: ld1.h { v1 }[2], [x2] -; CHECK-SD-NEXT: ld1.h { v1 }[3], [x3] -; CHECK-SD-NEXT: sub.4h v0, v1, v0 -; CHECK-SD-NEXT: str d0, [x4] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_ld1lane_build_i16: -; CHECK-GI: ; %bb.0: -; CHECK-GI-NEXT: ldr h1, [x0] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: mov.h v1[1], v2[0] -; CHECK-GI-NEXT: ldr h2, [x2] -; CHECK-GI-NEXT: mov.h v1[2], v2[0] -; CHECK-GI-NEXT: ldr h2, [x3] -; CHECK-GI-NEXT: mov.h v1[3], v2[0] -; CHECK-GI-NEXT: sub.4h v0, v1, v0 -; CHECK-GI-NEXT: str d0, [x4] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_ld1lane_build_i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: ld1.h { v1 }[1], [x1] +; CHECK-NEXT: ld1.h { v1 }[2], [x2] +; CHECK-NEXT: ld1.h { v1 }[3], [x3] +; CHECK-NEXT: sub.4h v0, v1, v0 +; CHECK-NEXT: str d0, [x4] +; CHECK-NEXT: ret %ld.a = load i16, ptr %a %ld.b = load i16, ptr %b %ld.c = load i16, ptr %c @@ -13880,34 +13865,18 @@ define void @test_ld1lane_build_i16(ptr %a, ptr %b, ptr %c, ptr %d, <4 x i16> % } define void @test_ld1lane_build_half(ptr %a, ptr %b, ptr %c, ptr %d, <4 x half> %e, ptr %p) { -; CHECK-SD-LABEL: test_ld1lane_build_half: -; CHECK-SD: ; %bb.0: -; CHECK-SD-NEXT: ldr h1, [x0] -; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h -; CHECK-SD-NEXT: ld1.h { v1 }[1], [x1] -; CHECK-SD-NEXT: ld1.h { v1 }[2], [x2] -; CHECK-SD-NEXT: ld1.h { v1 }[3], [x3] -; CHECK-SD-NEXT: fcvtl v1.4s, v1.4h -; CHECK-SD-NEXT: fsub.4s v0, v1, v0 -; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s -; CHECK-SD-NEXT: str d0, [x4] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_ld1lane_build_half: -; CHECK-GI: ; %bb.0: -; CHECK-GI-NEXT: ldr h1, [x0] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h -; CHECK-GI-NEXT: mov.h v1[1], v2[0] -; CHECK-GI-NEXT: ldr h2, [x2] -; CHECK-GI-NEXT: mov.h v1[2], v2[0] -; CHECK-GI-NEXT: ldr h2, [x3] -; CHECK-GI-NEXT: mov.h v1[3], v2[0] -; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NEXT: fsub.4s v0, v1, v0 -; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s -; CHECK-GI-NEXT: str d0, [x4] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_ld1lane_build_half: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NEXT: ld1.h { v1 }[1], [x1] +; CHECK-NEXT: ld1.h { v1 }[2], [x2] +; CHECK-NEXT: ld1.h { v1 }[3], [x3] +; CHECK-NEXT: fcvtl v1.4s, v1.4h +; CHECK-NEXT: fsub.4s v0, v1, v0 +; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: str d0, [x4] +; CHECK-NEXT: ret %ld.a = load half, ptr %a %ld.b = load half, ptr %b %ld.c = load half, ptr %c @@ -13942,6 +13911,7 @@ define void @test_ld1lane_build_i8(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr ; CHECK-GI-NEXT: ldr b1, [x0] ; CHECK-GI-NEXT: ldr b2, [x1] ; CHECK-GI-NEXT: ldr x8, [sp] +; CHECK-GI-NEXT: mov.b v1[0], v1[0] ; CHECK-GI-NEXT: mov.b v1[1], v2[0] ; CHECK-GI-NEXT: ldr b2, [x2] ; CHECK-GI-NEXT: mov.b v1[2], v2[0] diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index c56f4409e3a62..c0d91c1e0c836 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1259,7 +1259,7 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) { ; ; CHECK-GI-LABEL: scalar_to_vector.v2i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s0, w0 +; 
CHECK-GI-NEXT: mov v0.s[0], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %b = insertelement <2 x i32> undef, i32 %a, i32 0 @@ -1267,19 +1267,29 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) { } define <4 x i32> @scalar_to_vector.v4i32(i32 %a) { -; CHECK-LABEL: scalar_to_vector.v4i32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_to_vector.v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_to_vector.v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: ret %b = insertelement <4 x i32> undef, i32 %a, i32 0 ret <4 x i32> %b } define <2 x i64> @scalar_to_vector.v2i64(i64 %a) { -; CHECK-LABEL: scalar_to_vector.v2i64: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_to_vector.v2i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov d0, x0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_to_vector.v2i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v0.d[0], x0 +; CHECK-GI-NEXT: ret %b = insertelement <2 x i64> undef, i64 %a, i32 0 ret <2 x i64> %b } @@ -1348,21 +1358,22 @@ define <8 x i8> @getl(<16 x i8> %x) #0 { ; ; CHECK-GI-LABEL: getl: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov b4, v0.b[4] -; CHECK-GI-NEXT: mov b5, v0.b[5] -; CHECK-GI-NEXT: mov b6, v0.b[6] -; CHECK-GI-NEXT: mov b7, v0.b[7] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[4], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[5], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[6], v6.b[0] -; CHECK-GI-NEXT: mov v0.b[7], v7.b[0] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov v1.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[3] +; CHECK-GI-NEXT: mov v1.b[2], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[4] +; CHECK-GI-NEXT: mov v1.b[3], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[5] +; CHECK-GI-NEXT: mov v1.b[4], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[6] +; CHECK-GI-NEXT: mov b0, v0.b[7] +; CHECK-GI-NEXT: mov v1.b[5], v2.b[0] +; CHECK-GI-NEXT: mov v1.b[6], v3.b[0] +; CHECK-GI-NEXT: mov v1.b[7], v0.b[0] +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: ret %vecext = extractelement <16 x i8> %x, i32 0 %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0 @@ -1405,16 +1416,13 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) { ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: mov w9, w0 -; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov x8, sp ; CHECK-GI-NEXT: str q0, [sp] ; CHECK-GI-NEXT: and x9, x9, #0x7 -; CHECK-GI-NEXT: mov h3, v0.h[2] ; CHECK-GI-NEXT: ldr h1, [x8, x9, lsl #1] -; CHECK-GI-NEXT: mov h0, v0.h[3] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] ; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #16 ; CHECK-GI-NEXT: ret @@ -1709,8 +1717,8 @@ define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) { ; ; CHECK-GI-LABEL: test_concat_undef_v1i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov v0.s[1], w8 +; 
CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v0.s[1], v0.s[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -1794,25 +1802,26 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v16i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: mov v2.16b, v1.16b +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov b3, v0.b[1] ; CHECK-GI-NEXT: adrp x8, .LCPI127_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v0.b[2] -; CHECK-GI-NEXT: mov b4, v0.b[3] -; CHECK-GI-NEXT: mov b5, v0.b[4] -; CHECK-GI-NEXT: mov b6, v0.b[5] -; CHECK-GI-NEXT: mov b7, v0.b[6] -; CHECK-GI-NEXT: mov b16, v0.b[7] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI127_0] -; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[4], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[5], v6.b[0] -; CHECK-GI-NEXT: mov v0.b[6], v7.b[0] -; CHECK-GI-NEXT: mov v0.b[7], v16.b[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v1.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[3] +; CHECK-GI-NEXT: mov v1.b[2], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[4] +; CHECK-GI-NEXT: mov v1.b[3], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[5] +; CHECK-GI-NEXT: mov v1.b[4], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[6] +; CHECK-GI-NEXT: mov b0, v0.b[7] +; CHECK-GI-NEXT: mov v1.b[5], v3.b[0] +; CHECK-GI-NEXT: mov v1.b[6], v4.b[0] +; CHECK-GI-NEXT: mov v1.b[7], v0.b[0] +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI127_0] +; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -1844,36 +1853,38 @@ define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v16i8_v16i8_v8i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b3, v0.b[1] +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov b4, v0.b[3] -; CHECK-GI-NEXT: mov b5, v0.b[4] -; CHECK-GI-NEXT: mov b6, v0.b[5] -; CHECK-GI-NEXT: mov b7, v0.b[6] -; CHECK-GI-NEXT: mov b16, v0.b[7] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[1] -; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov v2.b[1], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[2], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[4] +; CHECK-GI-NEXT: mov v2.b[3], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[5] +; CHECK-GI-NEXT: mov v2.b[4], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[6] +; CHECK-GI-NEXT: mov b0, v0.b[7] +; CHECK-GI-NEXT: mov v2.b[5], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[2] -; CHECK-GI-NEXT: mov v0.b[3], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[4], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[5], v6.b[0] -; CHECK-GI-NEXT: mov v0.b[6], v7.b[0] -; CHECK-GI-NEXT: mov v0.b[7], v16.b[0] -; CHECK-GI-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[9], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[10], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[6], v4.b[0] +; CHECK-GI-NEXT: mov v2.b[7], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[1] +; CHECK-GI-NEXT: mov 
v2.b[8], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[9], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[3] +; CHECK-GI-NEXT: mov v2.b[10], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[4] -; CHECK-GI-NEXT: mov v0.b[11], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[5] -; CHECK-GI-NEXT: mov v0.b[12], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[11], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[5] +; CHECK-GI-NEXT: mov v2.b[12], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[6] -; CHECK-GI-NEXT: mov b1, v1.b[7] -; CHECK-GI-NEXT: mov v0.b[13], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[15], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[13], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[7] +; CHECK-GI-NEXT: mov v2.b[14], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[15], v0.b[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <16 x i8> %x, i32 0 @@ -1922,36 +1933,38 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 { ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v8i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b3, v0.b[1] +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov b4, v0.b[3] -; CHECK-GI-NEXT: mov b5, v0.b[4] -; CHECK-GI-NEXT: mov b6, v0.b[5] -; CHECK-GI-NEXT: mov b7, v0.b[6] -; CHECK-GI-NEXT: mov b16, v0.b[7] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[1] -; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov v2.b[1], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[2], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[4] +; CHECK-GI-NEXT: mov v2.b[3], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[5] +; CHECK-GI-NEXT: mov v2.b[4], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[6] +; CHECK-GI-NEXT: mov b0, v0.b[7] +; CHECK-GI-NEXT: mov v2.b[5], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[2] -; CHECK-GI-NEXT: mov v0.b[3], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[4], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[5], v6.b[0] -; CHECK-GI-NEXT: mov v0.b[6], v7.b[0] -; CHECK-GI-NEXT: mov v0.b[7], v16.b[0] -; CHECK-GI-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[9], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[10], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[6], v4.b[0] +; CHECK-GI-NEXT: mov v2.b[7], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[1] +; CHECK-GI-NEXT: mov v2.b[8], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[9], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[3] +; CHECK-GI-NEXT: mov v2.b[10], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[4] -; CHECK-GI-NEXT: mov v0.b[11], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[5] -; CHECK-GI-NEXT: mov v0.b[12], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[11], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[5] +; CHECK-GI-NEXT: mov v2.b[12], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[6] -; CHECK-GI-NEXT: mov b1, v1.b[7] -; CHECK-GI-NEXT: mov v0.b[13], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[15], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[13], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[7] +; CHECK-GI-NEXT: mov v2.b[14], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[15], v0.b[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -2017,17 +2030,15 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 
def $q0_q1 +; CHECK-GI-NEXT: mov v2.16b, v1.16b +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: adrp x8, .LCPI131_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: mov h2, v0.h[1] -; CHECK-GI-NEXT: mov h3, v0.h[2] -; CHECK-GI-NEXT: mov h4, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI131_0] -; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI131_0] +; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -2051,20 +2062,16 @@ define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v4i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov h2, v0.h[1] -; CHECK-GI-NEXT: mov h3, v0.h[2] +; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h4, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: mov h2, v1.h[1] -; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] -; CHECK-GI-NEXT: mov h3, v1.h[2] -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NEXT: mov h1, v1.h[3] -; CHECK-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[6], v3.h[0] -; CHECK-GI-NEXT: mov v0.h[7], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v2.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v2.h[3], v0.h[3] +; CHECK-GI-NEXT: mov v2.h[4], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[5], v1.h[1] +; CHECK-GI-NEXT: mov v2.h[6], v1.h[2] +; CHECK-GI-NEXT: mov v2.h[7], v1.h[3] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <8 x i16> %x, i32 0 @@ -2097,20 +2104,16 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 { ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v4i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h2, v0.h[1] -; CHECK-GI-NEXT: mov h3, v0.h[2] +; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h4, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: mov h2, v1.h[1] -; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] -; CHECK-GI-NEXT: mov h3, v1.h[2] -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NEXT: mov h1, v1.h[3] -; CHECK-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[6], v3.h[0] -; CHECK-GI-NEXT: mov v0.h[7], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v2.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v2.h[3], v0.h[3] +; CHECK-GI-NEXT: mov v2.h[4], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[5], v1.h[1] +; CHECK-GI-NEXT: mov v2.h[6], v1.h[2] +; CHECK-GI-NEXT: mov v2.h[7], v1.h[3] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -2160,13 +2163,13 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: mov v2.16b, v1.16b +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; 
CHECK-GI-NEXT: adrp x8, .LCPI135_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI135_0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI135_0] +; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <2 x i32> %x, i32 0 @@ -2186,12 +2189,12 @@ define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v2i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: mov s2, v1.s[1] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: mov v0.s[3], v2.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[3], v1.s[1] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <4 x i32> %x, i32 0 @@ -2241,11 +2244,18 @@ entry: } define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 { -; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_concat_v2i64_v2i64_v1i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_concat_v2i64_v2i64_v1i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.d[0], v0.d[0] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: ret entry: %vecext = extractelement <2 x i64> %x, i32 0 %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll index 1f5654d59926d..a6a825b26b3b5 100644 --- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll +++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll @@ -466,92 +466,62 @@ define <32 x i8> @sext_v32i1(<32 x i1> %arg) { ; ; CHECK-GI-LABEL: sext_v32i1: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr w9, [sp, #64] -; CHECK-GI-NEXT: ldr w8, [sp, #72] +; CHECK-GI-NEXT: ldr w8, [sp, #64] ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s2, w1 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w9, [sp, #72] +; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: ldr w8, [sp, #80] +; CHECK-GI-NEXT: mov.b v0[1], w1 +; CHECK-GI-NEXT: mov.b v1[1], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #128] -; CHECK-GI-NEXT: mov.b v0[1], v2[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: mov.b v1[1], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[2], w2 +; CHECK-GI-NEXT: mov.b v1[2], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #88] -; CHECK-GI-NEXT: mov.b v0[2], v2[0] -; CHECK-GI-NEXT: fmov s2, w3 -; CHECK-GI-NEXT: mov.b v1[2], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[3], w3 +; CHECK-GI-NEXT: mov.b v1[3], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #96] -; CHECK-GI-NEXT: mov.b v0[3], v2[0] -; CHECK-GI-NEXT: fmov s2, w4 -; CHECK-GI-NEXT: mov.b v1[3], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[4], 
w4 +; CHECK-GI-NEXT: mov.b v1[4], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #104] -; CHECK-GI-NEXT: mov.b v0[4], v2[0] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: mov.b v1[4], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[5], w5 +; CHECK-GI-NEXT: mov.b v1[5], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #112] -; CHECK-GI-NEXT: mov.b v0[5], v2[0] -; CHECK-GI-NEXT: fmov s2, w6 -; CHECK-GI-NEXT: mov.b v1[5], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[6], w6 +; CHECK-GI-NEXT: mov.b v1[6], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #120] -; CHECK-GI-NEXT: mov.b v0[6], v2[0] -; CHECK-GI-NEXT: fmov s2, w7 -; CHECK-GI-NEXT: mov.b v1[6], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[7], w7 +; CHECK-GI-NEXT: mov.b v1[7], w8 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: mov.b v0[7], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[8], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #8] -; CHECK-GI-NEXT: mov.b v1[7], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[8], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #136] -; CHECK-GI-NEXT: mov.b v0[8], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[9], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov.b v1[8], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[9], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #144] -; CHECK-GI-NEXT: mov.b v0[9], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[10], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov.b v1[9], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[10], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #152] -; CHECK-GI-NEXT: mov.b v0[10], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[11], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #32] -; CHECK-GI-NEXT: mov.b v1[10], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[11], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #160] -; CHECK-GI-NEXT: mov.b v0[11], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[12], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #40] -; CHECK-GI-NEXT: mov.b v1[11], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[12], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #168] -; CHECK-GI-NEXT: mov.b v0[12], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[13], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #48] -; CHECK-GI-NEXT: mov.b v1[12], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[13], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #176] -; CHECK-GI-NEXT: mov.b v0[13], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[14], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #56] -; CHECK-GI-NEXT: mov.b v1[13], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[14], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #184] -; CHECK-GI-NEXT: mov.b v0[14], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: mov.b v1[14], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov.b v0[15], v2[0] -; CHECK-GI-NEXT: mov.b v1[15], v3[0] +; CHECK-GI-NEXT: mov.b v0[15], w8 +; CHECK-GI-NEXT: mov.b v1[15], w9 ; CHECK-GI-NEXT: shl.16b v0, v0, #7 ; CHECK-GI-NEXT: shl.16b v1, v1, #7 ; CHECK-GI-NEXT: sshr.16b v0, v0, #7 @@ -840,194 +810,134 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) { ; CHECK-GI-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w29, -16 -; CHECK-GI-NEXT: ldr w9, [sp, #80] -; CHECK-GI-NEXT: ldr w11, [sp, #88] +; CHECK-GI-NEXT: ldr w13, [sp, #80] +; CHECK-GI-NEXT: ldr w11, [sp, #208] ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s3, w1 -; CHECK-GI-NEXT: ldr w8, [sp, #208] -; CHECK-GI-NEXT: ldr w10, [sp, #216] -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s4, w11 ; CHECK-GI-NEXT: ldr w9, [sp, #336] -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: fmov s5, w10 -; CHECK-GI-NEXT: ldr w11, [sp, #344] -; CHECK-GI-NEXT: mov.b v0[1], v3[0] +; CHECK-GI-NEXT: ldr w8, [sp, #88] +; CHECK-GI-NEXT: ldr w10, [sp, #216] +; CHECK-GI-NEXT: fmov s1, w13 +; CHECK-GI-NEXT: fmov s2, w11 +; CHECK-GI-NEXT: ldr w12, [sp, #344] ; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v0[1], w1 ; CHECK-GI-NEXT: ldr w9, [sp, #224] -; CHECK-GI-NEXT: mov.b v1[1], v4[0] -; CHECK-GI-NEXT: fmov s4, w2 -; CHECK-GI-NEXT: fmov s6, w11 -; CHECK-GI-NEXT: mov.b v2[1], v5[0] +; CHECK-GI-NEXT: ldr w11, [sp, #400] +; CHECK-GI-NEXT: mov.b v1[1], w8 +; CHECK-GI-NEXT: mov.b v2[1], w10 ; CHECK-GI-NEXT: ldr w8, [sp, #96] +; CHECK-GI-NEXT: mov.b v3[1], w12 ; CHECK-GI-NEXT: ldr w10, [sp, #352] -; CHECK-GI-NEXT: ldr w11, [sp, #16] -; CHECK-GI-NEXT: mov.b v0[2], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #232] -; CHECK-GI-NEXT: mov.b v3[1], v6[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: mov.b v0[2], w2 +; CHECK-GI-NEXT: mov.b v1[2], w8 +; CHECK-GI-NEXT: mov.b v2[2], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #104] +; CHECK-GI-NEXT: mov.b v3[2], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #232] ; CHECK-GI-NEXT: ldr w10, [sp, #360] -; CHECK-GI-NEXT: mov.b v2[2], v4[0] -; CHECK-GI-NEXT: fmov s4, w3 -; CHECK-GI-NEXT: mov.b v1[2], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: mov.b v0[3], w3 +; CHECK-GI-NEXT: mov.b v1[3], w8 +; CHECK-GI-NEXT: mov.b v2[3], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #112] -; CHECK-GI-NEXT: mov.b v3[2], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #368] -; CHECK-GI-NEXT: mov.b v0[3], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: mov.b v3[3], w10 ; CHECK-GI-NEXT: ldr w9, [sp, #240] -; CHECK-GI-NEXT: mov.b v1[3], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w10, [sp, #368] +; CHECK-GI-NEXT: mov.b v0[4], w4 +; CHECK-GI-NEXT: mov.b v1[4], w8 +; CHECK-GI-NEXT: mov.b v2[4], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #120] -; CHECK-GI-NEXT: mov.b v2[3], v4[0] -; CHECK-GI-NEXT: fmov s4, w4 -; CHECK-GI-NEXT: mov.b v3[3], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #376] -; CHECK-GI-NEXT: mov.b v0[4], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: mov.b v3[4], w10 ; CHECK-GI-NEXT: ldr w9, [sp, #248] -; CHECK-GI-NEXT: mov.b v1[4], v5[0] -; CHECK-GI-NEXT: mov.b v3[4], v6[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #376] +; CHECK-GI-NEXT: mov.b v0[5], w5 +; CHECK-GI-NEXT: mov.b v1[5], w8 +; CHECK-GI-NEXT: mov.b v2[5], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #128] -; CHECK-GI-NEXT: ldr w10, [sp, #384] -; CHECK-GI-NEXT: mov.b v2[4], v4[0] -; CHECK-GI-NEXT: fmov s4, w5 -; CHECK-GI-NEXT: mov.b v1[5], v5[0] -; CHECK-GI-NEXT: mov.b v3[5], v6[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: mov.b v0[5], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: mov.b v3[5], w10 ; CHECK-GI-NEXT: ldr w9, [sp, #256] -; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #384] +; CHECK-GI-NEXT: mov.b v0[6], w6 
+; CHECK-GI-NEXT: mov.b v1[6], w8 +; CHECK-GI-NEXT: mov.b v2[6], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #136] -; CHECK-GI-NEXT: ldr w10, [sp, #392] -; CHECK-GI-NEXT: mov.b v2[5], v4[0] -; CHECK-GI-NEXT: fmov s4, w6 -; CHECK-GI-NEXT: mov.b v1[6], v5[0] -; CHECK-GI-NEXT: mov.b v3[6], v6[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w8, [sp, #144] -; CHECK-GI-NEXT: ldr w10, [sp, #400] -; CHECK-GI-NEXT: mov.b v0[6], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: mov.b v3[6], w10 ; CHECK-GI-NEXT: ldr w9, [sp, #264] -; CHECK-GI-NEXT: mov.b v1[7], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #152] -; CHECK-GI-NEXT: mov.b v3[7], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #408] -; CHECK-GI-NEXT: mov.b v2[6], v4[0] -; CHECK-GI-NEXT: fmov s4, w7 -; CHECK-GI-NEXT: mov.b v1[8], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #160] -; CHECK-GI-NEXT: mov.b v0[7], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #272] -; CHECK-GI-NEXT: mov.b v3[8], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #416] -; CHECK-GI-NEXT: mov.b v2[7], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #24] -; CHECK-GI-NEXT: mov.b v1[9], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #168] -; CHECK-GI-NEXT: mov.b v3[9], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #424] -; CHECK-GI-NEXT: mov.b v0[8], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #280] -; CHECK-GI-NEXT: mov.b v1[10], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #176] -; CHECK-GI-NEXT: mov.b v2[8], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #32] -; CHECK-GI-NEXT: mov.b v3[10], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #432] -; CHECK-GI-NEXT: mov.b v0[9], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #288] -; CHECK-GI-NEXT: mov.b v1[11], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #184] -; CHECK-GI-NEXT: mov.b v3[11], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #440] -; CHECK-GI-NEXT: mov.b v2[9], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #40] -; CHECK-GI-NEXT: mov.b v1[12], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #192] -; CHECK-GI-NEXT: mov.b v0[10], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #296] -; CHECK-GI-NEXT: mov.b v3[12], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #448] -; CHECK-GI-NEXT: mov.b v2[10], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #48] -; CHECK-GI-NEXT: mov.b v1[13], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #200] -; CHECK-GI-NEXT: mov.b v3[13], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #456] -; CHECK-GI-NEXT: mov.b v0[11], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #304] -; CHECK-GI-NEXT: fmov s7, w10 -; CHECK-GI-NEXT: mov.b v1[14], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: mov.b v2[11], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #56] -; CHECK-GI-NEXT: mov.b v3[14], v6[0] -; CHECK-GI-NEXT: mov.b v0[12], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #312] -; CHECK-GI-NEXT: mov.b v1[15], v5[0] -; CHECK-GI-NEXT: mov.b v3[15], v7[0] -; CHECK-GI-NEXT: mov.b v2[12], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; 
CHECK-GI-NEXT: ldr w11, [sp, #64] -; CHECK-GI-NEXT: shl.16b v1, v1, #7 -; CHECK-GI-NEXT: mov.b v0[13], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #320] -; CHECK-GI-NEXT: shl.16b v3, v3, #7 -; CHECK-GI-NEXT: sshr.16b v1, v1, #7 -; CHECK-GI-NEXT: mov.b v2[13], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #72] -; CHECK-GI-NEXT: sshr.16b v3, v3, #7 -; CHECK-GI-NEXT: mov.b v0[14], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #328] -; CHECK-GI-NEXT: fmov s6, w9 -; CHECK-GI-NEXT: mov.b v2[14], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: mov.b v0[15], v4[0] -; CHECK-GI-NEXT: mov.b v2[15], v6[0] +; CHECK-GI-NEXT: ldr w10, [sp, #392] +; CHECK-GI-NEXT: mov.b v0[7], w7 +; CHECK-GI-NEXT: mov.b v1[7], w8 +; CHECK-GI-NEXT: mov.b v2[7], w9 +; CHECK-GI-NEXT: ldr w8, [sp, #16] +; CHECK-GI-NEXT: mov.b v3[7], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #144] +; CHECK-GI-NEXT: ldr w10, [sp, #272] +; CHECK-GI-NEXT: mov.b v0[8], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: mov.b v1[8], w9 +; CHECK-GI-NEXT: mov.b v2[8], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #152] +; CHECK-GI-NEXT: mov.b v3[8], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #280] +; CHECK-GI-NEXT: ldr w11, [sp, #408] +; CHECK-GI-NEXT: mov.b v0[9], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #32] +; CHECK-GI-NEXT: mov.b v1[9], w9 +; CHECK-GI-NEXT: mov.b v2[9], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #160] +; CHECK-GI-NEXT: mov.b v3[9], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #288] +; CHECK-GI-NEXT: ldr w11, [sp, #416] +; CHECK-GI-NEXT: mov.b v0[10], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #40] +; CHECK-GI-NEXT: mov.b v1[10], w9 +; CHECK-GI-NEXT: mov.b v2[10], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #168] +; CHECK-GI-NEXT: mov.b v3[10], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #296] +; CHECK-GI-NEXT: ldr w11, [sp, #424] +; CHECK-GI-NEXT: mov.b v0[11], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #48] +; CHECK-GI-NEXT: mov.b v1[11], w9 +; CHECK-GI-NEXT: mov.b v2[11], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #176] +; CHECK-GI-NEXT: mov.b v3[11], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #304] +; CHECK-GI-NEXT: ldr w11, [sp, #432] +; CHECK-GI-NEXT: mov.b v0[12], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #56] +; CHECK-GI-NEXT: mov.b v1[12], w9 +; CHECK-GI-NEXT: mov.b v2[12], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #184] +; CHECK-GI-NEXT: mov.b v3[12], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #312] +; CHECK-GI-NEXT: ldr w11, [sp, #440] +; CHECK-GI-NEXT: mov.b v0[13], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #64] +; CHECK-GI-NEXT: mov.b v1[13], w9 +; CHECK-GI-NEXT: mov.b v2[13], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #192] +; CHECK-GI-NEXT: mov.b v3[13], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #320] +; CHECK-GI-NEXT: ldr w11, [sp, #448] +; CHECK-GI-NEXT: mov.b v0[14], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #72] +; CHECK-GI-NEXT: mov.b v1[14], w9 +; CHECK-GI-NEXT: mov.b v2[14], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #200] +; CHECK-GI-NEXT: mov.b v3[14], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #328] +; CHECK-GI-NEXT: ldr w11, [sp, #456] +; CHECK-GI-NEXT: mov.b v0[15], w8 +; CHECK-GI-NEXT: mov.b v1[15], w9 +; CHECK-GI-NEXT: mov.b v2[15], w10 +; CHECK-GI-NEXT: mov.b v3[15], w11 ; CHECK-GI-NEXT: shl.16b v0, v0, #7 +; CHECK-GI-NEXT: shl.16b v1, v1, #7 ; CHECK-GI-NEXT: shl.16b v2, v2, #7 +; CHECK-GI-NEXT: shl.16b v3, v3, #7 ; CHECK-GI-NEXT: sshr.16b v0, v0, #7 +; CHECK-GI-NEXT: sshr.16b v1, v1, #7 ; CHECK-GI-NEXT: sshr.16b v2, v2, #7 +; CHECK-GI-NEXT: sshr.16b v3, v3, #7 ; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-GI-NEXT: ret %res = sext <64 x i1> %arg to <64 x i8> diff 
--git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll index 44b92e6ccd088..a854cb7fec991 100644 --- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll +++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll @@ -368,28 +368,26 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x ; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-GI-NEXT: mov.16b v5, v4 -; CHECK-GI-NEXT: mov.b v5[1], v4[0] -; CHECK-GI-NEXT: mov.b v5[2], v4[0] -; CHECK-GI-NEXT: mov.b v5[3], v4[0] -; CHECK-GI-NEXT: mov.b v5[4], v4[0] -; CHECK-GI-NEXT: mov.b v5[5], v4[0] -; CHECK-GI-NEXT: mov.b v5[6], v4[0] -; CHECK-GI-NEXT: mov.b v5[7], v4[0] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov.b v4[1], w0 +; CHECK-GI-NEXT: mov.b v4[2], w0 +; CHECK-GI-NEXT: mov.b v4[3], w0 +; CHECK-GI-NEXT: mov.b v4[4], w0 +; CHECK-GI-NEXT: mov.b v4[5], w0 +; CHECK-GI-NEXT: mov.b v4[6], w0 +; CHECK-GI-NEXT: mov.b v4[7], w0 +; CHECK-GI-NEXT: mov.b v4[8], w8 +; CHECK-GI-NEXT: mov.b v4[9], w8 +; CHECK-GI-NEXT: mov.b v4[10], w8 +; CHECK-GI-NEXT: mov.b v4[11], w8 +; CHECK-GI-NEXT: mov.b v4[12], w8 +; CHECK-GI-NEXT: mov.b v4[13], w8 +; CHECK-GI-NEXT: mov.b v4[14], w8 +; CHECK-GI-NEXT: mov.b v4[15], w8 ; CHECK-GI-NEXT: adrp x8, .LCPI10_1 -; CHECK-GI-NEXT: mov.b v5[8], v4[0] -; CHECK-GI-NEXT: mov.b v5[9], v4[0] -; CHECK-GI-NEXT: mov.b v5[10], v4[0] -; CHECK-GI-NEXT: mov.b v5[11], v4[0] -; CHECK-GI-NEXT: mov.b v5[12], v4[0] -; CHECK-GI-NEXT: mov.b v5[13], v4[0] -; CHECK-GI-NEXT: mov.b v5[14], v4[0] -; CHECK-GI-NEXT: mov.b v5[15], v4[0] -; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI10_1] +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI10_1] ; CHECK-GI-NEXT: adrp x8, .LCPI10_0 -; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v5 -; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v5 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 ; CHECK-GI-NEXT: ret @@ -488,35 +486,32 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x ; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #1 // =0x1 -; CHECK-GI-NEXT: fmov s6, w0 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 ; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: mov w8, #255 // =0xff ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-GI-NEXT: mov.16b v5, v4 -; CHECK-GI-NEXT: mov.b v5[1], v4[0] -; CHECK-GI-NEXT: mov.b v5[2], v4[0] -; CHECK-GI-NEXT: mov.b v5[3], v4[0] -; CHECK-GI-NEXT: mov.b v5[4], v4[0] -; CHECK-GI-NEXT: mov.b v5[5], v4[0] -; CHECK-GI-NEXT: mov.b v5[6], v4[0] -; CHECK-GI-NEXT: mov.b v5[7], v4[0] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov.b v4[1], w8 +; CHECK-GI-NEXT: mov.b v4[2], w8 +; CHECK-GI-NEXT: mov.b v4[3], w8 +; CHECK-GI-NEXT: mov.b v4[4], w8 +; CHECK-GI-NEXT: mov.b v4[5], w8 +; CHECK-GI-NEXT: mov.b v4[6], w8 +; CHECK-GI-NEXT: mov.b v4[7], w8 +; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: mov.b v4[8], w8 +; CHECK-GI-NEXT: mov.b v4[9], w8 +; CHECK-GI-NEXT: mov.b v4[10], w8 +; CHECK-GI-NEXT: mov.b v4[11], w8 +; CHECK-GI-NEXT: mov.b v4[12], w0 +; CHECK-GI-NEXT: mov.b v4[13], w0 +; 
CHECK-GI-NEXT: mov.b v4[14], w8 ; CHECK-GI-NEXT: adrp x8, .LCPI11_1 -; CHECK-GI-NEXT: mov.b v5[8], v4[0] -; CHECK-GI-NEXT: mov.b v5[9], v4[0] -; CHECK-GI-NEXT: mov.b v5[10], v4[0] -; CHECK-GI-NEXT: mov.b v5[11], v4[0] -; CHECK-GI-NEXT: mov.b v5[12], v6[0] -; CHECK-GI-NEXT: mov.b v5[13], v6[0] -; CHECK-GI-NEXT: mov.b v5[14], v4[0] -; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI11_1] +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI11_1] ; CHECK-GI-NEXT: adrp x8, .LCPI11_0 -; CHECK-GI-NEXT: mov.b v5[15], v6[0] -; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v5 -; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: mov.b v4[15], w0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v5 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 ; CHECK-GI-NEXT: ret @@ -623,32 +618,30 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmov s4, w0 ; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: adrp x9, .LCPI12_1 ; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q5, [x9, :lo12:.LCPI12_1] ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: mov.16b v5, v4 -; CHECK-GI-NEXT: mov.b v5[1], v4[0] -; CHECK-GI-NEXT: mov.b v5[2], v4[0] -; CHECK-GI-NEXT: mov.b v5[3], v4[0] -; CHECK-GI-NEXT: mov.b v5[4], v4[0] -; CHECK-GI-NEXT: mov.b v5[5], v4[0] -; CHECK-GI-NEXT: mov.b v5[6], v4[0] -; CHECK-GI-NEXT: mov.b v5[7], v4[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: adrp x8, .LCPI12_1 -; CHECK-GI-NEXT: mov.b v5[8], v4[0] -; CHECK-GI-NEXT: mov.b v5[9], v4[0] -; CHECK-GI-NEXT: mov.b v5[10], v4[0] -; CHECK-GI-NEXT: mov.b v5[11], v4[0] -; CHECK-GI-NEXT: mov.b v5[12], v4[0] -; CHECK-GI-NEXT: mov.b v5[13], v4[0] -; CHECK-GI-NEXT: mov.b v5[14], v4[0] -; CHECK-GI-NEXT: mov.b v5[15], v4[0] -; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI12_1] +; CHECK-GI-NEXT: mov.b v4[1], w0 +; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v5 +; CHECK-GI-NEXT: mov.b v4[2], w0 +; CHECK-GI-NEXT: mov.b v4[3], w0 +; CHECK-GI-NEXT: mov.b v4[4], w0 +; CHECK-GI-NEXT: mov.b v4[5], w0 +; CHECK-GI-NEXT: mov.b v4[6], w0 +; CHECK-GI-NEXT: mov.b v4[7], w0 +; CHECK-GI-NEXT: mov.b v4[8], w8 +; CHECK-GI-NEXT: mov.b v4[9], w8 +; CHECK-GI-NEXT: mov.b v4[10], w8 +; CHECK-GI-NEXT: mov.b v4[11], w8 +; CHECK-GI-NEXT: mov.b v4[12], w8 +; CHECK-GI-NEXT: mov.b v4[13], w8 +; CHECK-GI-NEXT: mov.b v4[14], w8 +; CHECK-GI-NEXT: mov.b v4[15], w8 ; CHECK-GI-NEXT: adrp x8, .LCPI12_0 -; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v4 -; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v5 +; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v4 ; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI12_0] ; CHECK-GI-NEXT: tbl.16b v0, { v2, v3 }, v0 ; CHECK-GI-NEXT: ret @@ -774,30 +767,28 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 ; CHECK-GI-NEXT: mov w8, #255 // =0xff ; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: fmov s6, w8 -; CHECK-GI-NEXT: adrp x8, .LCPI13_1 ; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: mov.16b v5, v4 -; CHECK-GI-NEXT: mov.b v5[1], v4[0] -; CHECK-GI-NEXT: mov.b v5[2], v4[0] -; CHECK-GI-NEXT: 
mov.b v5[3], v4[0] -; CHECK-GI-NEXT: mov.b v5[4], v4[0] -; CHECK-GI-NEXT: mov.b v5[5], v4[0] -; CHECK-GI-NEXT: mov.b v5[6], v4[0] -; CHECK-GI-NEXT: mov.b v5[7], v4[0] -; CHECK-GI-NEXT: mov.b v5[8], v6[0] -; CHECK-GI-NEXT: mov.b v5[9], v6[0] -; CHECK-GI-NEXT: mov.b v5[10], v6[0] -; CHECK-GI-NEXT: mov.b v5[11], v6[0] -; CHECK-GI-NEXT: mov.b v5[12], v6[0] -; CHECK-GI-NEXT: mov.b v5[13], v6[0] -; CHECK-GI-NEXT: mov.b v5[14], v4[0] -; CHECK-GI-NEXT: mov.b v5[15], v4[0] -; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI13_1] +; CHECK-GI-NEXT: mov.b v4[1], w0 +; CHECK-GI-NEXT: mov.b v4[2], w0 +; CHECK-GI-NEXT: mov.b v4[3], w0 +; CHECK-GI-NEXT: mov.b v4[4], w0 +; CHECK-GI-NEXT: mov.b v4[5], w0 +; CHECK-GI-NEXT: mov.b v4[6], w0 +; CHECK-GI-NEXT: mov.b v4[7], w0 +; CHECK-GI-NEXT: mov.b v4[8], w8 +; CHECK-GI-NEXT: mov.b v4[9], w8 +; CHECK-GI-NEXT: mov.b v4[10], w8 +; CHECK-GI-NEXT: mov.b v4[11], w8 +; CHECK-GI-NEXT: mov.b v4[12], w8 +; CHECK-GI-NEXT: mov.b v4[13], w8 +; CHECK-GI-NEXT: adrp x8, .LCPI13_1 +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI13_1] ; CHECK-GI-NEXT: adrp x8, .LCPI13_0 -; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v4 -; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v5 +; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v5 +; CHECK-GI-NEXT: mov.b v4[14], w0 +; CHECK-GI-NEXT: mov.b v4[15], w0 +; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v4 ; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI13_0] ; CHECK-GI-NEXT: tbl.16b v0, { v2, v3 }, v0 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index 5de99586f7fc7..79cfeedb74bce 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -13,7 +13,7 @@ define <4 x i16> @foo1(<2 x i32> %a) { ; CHECK-GI-LABEL: foo1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #58712 // =0xe558 -; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s ; CHECK-GI-NEXT: rev32 v0.4h, v0.4h ; CHECK-GI-NEXT: ret @@ -33,7 +33,7 @@ define <4 x i16> @foo2(<2 x i32> %a) { ; CHECK-GI-LABEL: foo2: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #712 // =0x2c8 -; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s ; CHECK-GI-NEXT: rev32 v0.4h, v0.4h ; CHECK-GI-NEXT: ret @@ -60,13 +60,11 @@ define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){ ; CHECK-GI-LABEL: bitcast_v4i8_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret %c = add <4 x i8> %a, %b @@ -87,12 +85,13 @@ define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){ ; CHECK-GI-NEXT: add w8, w0, w1 ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; 
CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %c = add i32 %a, %b @@ -117,9 +116,9 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){ ; CHECK-GI-LABEL: bitcast_v2i16_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: xtn v0.4h, v1.4s ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret %c = add <2 x i16> %a, %b @@ -419,16 +418,17 @@ define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){ ; CHECK-GI-LABEL: bitcast_v2i16_v4i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: xtn v0.4h, v1.4s ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %c = add <2 x i16> %a, %b @@ -455,13 +455,11 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-GI-LABEL: bitcast_v4i8_v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 @@ -515,10 +513,12 @@ define <4 x i64> @bitcast_v8i32_v4i64(<8 x i32> %a, <8 x i32> %b){ ; ; CHECK-GI-LABEL: bitcast_v8i32_v4i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: mov x8, v2.d[1] +; CHECK-GI-NEXT: mov x9, v3.d[1] +; CHECK-GI-NEXT: mov v0.d[0], v2.d[0] +; CHECK-GI-NEXT: mov v1.d[0], v3.d[0] ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: ret @@ -574,10 +574,12 @@ define <4 x i64> @bitcast_v16i16_v4i64(<16 x i16> %a, <16 x i16> %b){ ; ; CHECK-GI-LABEL: bitcast_v16i16_v4i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h -; CHECK-GI-NEXT: add v1.8h, v1.8h, v3.8h -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: add v2.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: add v3.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: mov x8, v2.d[1] +; CHECK-GI-NEXT: mov x9, v3.d[1] +; CHECK-GI-NEXT: mov v0.d[0], v2.d[0] +; 
CHECK-GI-NEXT: mov v1.d[0], v3.d[0] ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: ret @@ -614,14 +616,18 @@ define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){ ; ; CHECK-GI-LABEL: bitcast_v16i32_v8i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-GI-NEXT: add v2.4s, v2.4s, v6.4s -; CHECK-GI-NEXT: add v3.4s, v3.4s, v7.4s -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mov x10, v2.d[1] -; CHECK-GI-NEXT: mov x11, v3.d[1] +; CHECK-GI-NEXT: add v4.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: add v5.4s, v1.4s, v5.4s +; CHECK-GI-NEXT: add v6.4s, v2.4s, v6.4s +; CHECK-GI-NEXT: add v7.4s, v3.4s, v7.4s +; CHECK-GI-NEXT: mov x8, v4.d[1] +; CHECK-GI-NEXT: mov x9, v5.d[1] +; CHECK-GI-NEXT: mov x10, v6.d[1] +; CHECK-GI-NEXT: mov x11, v7.d[1] +; CHECK-GI-NEXT: mov v0.d[0], v4.d[0] +; CHECK-GI-NEXT: mov v1.d[0], v5.d[0] +; CHECK-GI-NEXT: mov v2.d[0], v6.d[0] +; CHECK-GI-NEXT: mov v3.d[0], v7.d[0] ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: mov v2.d[1], x10 diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll index 071613b9cc011..9ee924dd2548a 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -110,8 +110,8 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %a){ ; CHECK-GI-LABEL: bswap_v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov v0.h[1], w8 ; CHECK-GI-NEXT: rev16 v0.8b, v0.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] @@ -146,7 +146,7 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){ ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: rev w8, w8 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index f6eeeef4faf7e..18570b2d793ff 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -11,12 +11,13 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) { ; CHECK-GI-LABEL: concat1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov s2, v1.s[1] -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v2.b[0] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: mov v0.b[3], w9 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -32,22 +33,20 @@ define <8 x i8> @concat2(<4 x i8> %A, <4 x i8> %B) { ; ; CHECK-GI-LABEL: concat2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h2, v1.h[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h3, v0.h[1] -; CHECK-GI-NEXT: mov h4, v1.h[2] -; CHECK-GI-NEXT: mov h5, v1.h[3] -; CHECK-GI-NEXT: mov h6, v0.h[3] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov 
v0.h[1], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[2], v4.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v5.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v6.h[0] -; CHECK-GI-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v3.h[0], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v3.h[1], v1.h[1] +; CHECK-GI-NEXT: mov v2.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v3.h[2], v1.h[2] +; CHECK-GI-NEXT: mov v2.h[3], v0.h[3] +; CHECK-GI-NEXT: mov v3.h[3], v1.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v2.8h +; CHECK-GI-NEXT: xtn v1.8b, v3.8h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -75,14 +74,16 @@ define <4 x i16> @concat4(<2 x i16> %A, <2 x i16> %B) { ; ; CHECK-GI-LABEL: concat4: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s2, v1.s[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s3, v0.s[1] -; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v0.s[1], v3.s[0] -; CHECK-GI-NEXT: xtn v1.4h, v1.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: xtn v2.4h, v2.4s +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: xtn v1.4h, v0.4s +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -145,8 +146,9 @@ define <4 x half> @concat9(<2 x half> %A, <2 x half> %B) { ; ; CHECK-GI-LABEL: concat9: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -181,12 +183,14 @@ define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) { ; ; CHECK-GI-LABEL: concat_v8s16_v2s16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: ldr h1, [x0] ; CHECK-GI-NEXT: ldr h2, [x0, #2] +; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: xtn v2.4h, v0.4s -; CHECK-GI-NEXT: xtn v0.4h, v1.4s +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: mov v0.s[2], w8 @@ -208,9 +212,10 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) { ; CHECK-GI-NEXT: dup v0.8h, w8 ; CHECK-GI-NEXT: xtn v1.8b, v0.8h ; CHECK-GI-NEXT: ldr s0, [x0] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: mov v0.s[3], v1.s[0] +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[3], w8 ; CHECK-GI-NEXT: ret %a = load <4 x i8>, ptr %ptr %b = shufflevector <4 x i8> %a, <4 x i8> %a, <16 x i32> @@ -218,24 +223,13 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) { } define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %ptrD) { -; CHECK-SD-LABEL: concat_v16s8_v4s8_load: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldr s0, [x0] -; CHECK-SD-NEXT: ld1 { v0.s }[1], [x1] -; CHECK-SD-NEXT: 
ld1 { v0.s }[2], [x2] -; CHECK-SD-NEXT: ld1 { v0.s }[3], [x3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: concat_v16s8_v4s8_load: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr s0, [x0] -; CHECK-GI-NEXT: ldr s1, [x1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: ldr s1, [x2] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: ldr s1, [x3] -; CHECK-GI-NEXT: mov v0.s[3], v1.s[0] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: concat_v16s8_v4s8_load: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x1] +; CHECK-NEXT: ld1 { v0.s }[2], [x2] +; CHECK-NEXT: ld1 { v0.s }[3], [x3] +; CHECK-NEXT: ret %A = load <4 x i8>, ptr %ptrA %B = load <4 x i8>, ptr %ptrB %C = load <4 x i8>, ptr %ptrC @@ -261,41 +255,35 @@ define <16 x i8> @concat_v16s8_v4s8_reg(<4 x i8> %A, <4 x i8> %B, <4 x i8> %C, < ; ; CHECK-GI-LABEL: concat_v16s8_v4s8_reg: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h4, v1.h[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h5, v0.h[1] +; CHECK-GI-NEXT: mov v4.h[0], v0.h[0] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v5.h[0], v1.h[0] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-GI-NEXT: mov h6, v1.h[2] -; CHECK-GI-NEXT: mov h7, v1.h[3] -; CHECK-GI-NEXT: mov h16, v2.h[1] -; CHECK-GI-NEXT: mov h17, v0.h[3] -; CHECK-GI-NEXT: mov h18, v2.h[3] -; CHECK-GI-NEXT: mov v1.h[1], v4.h[0] -; CHECK-GI-NEXT: mov h4, v0.h[2] -; CHECK-GI-NEXT: mov v0.h[1], v5.h[0] -; CHECK-GI-NEXT: mov h5, v2.h[2] -; CHECK-GI-NEXT: mov v2.h[1], v16.h[0] -; CHECK-GI-NEXT: mov v1.h[2], v6.h[0] -; CHECK-GI-NEXT: mov h6, v3.h[1] -; CHECK-GI-NEXT: mov v0.h[2], v4.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] -; CHECK-GI-NEXT: mov h4, v3.h[2] -; CHECK-GI-NEXT: mov h5, v3.h[3] -; CHECK-GI-NEXT: mov v1.h[3], v7.h[0] -; CHECK-GI-NEXT: mov v3.h[1], v6.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v17.h[0] -; CHECK-GI-NEXT: mov v2.h[3], v18.h[0] -; CHECK-GI-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h -; CHECK-GI-NEXT: xtn v2.8b, v2.8h -; CHECK-GI-NEXT: mov v3.h[3], v5.h[0] +; CHECK-GI-NEXT: mov v6.h[0], v2.h[0] +; CHECK-GI-NEXT: mov v7.h[0], v3.h[0] +; CHECK-GI-NEXT: mov v4.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v5.h[1], v1.h[1] +; CHECK-GI-NEXT: mov v6.h[1], v2.h[1] +; CHECK-GI-NEXT: mov v7.h[1], v3.h[1] +; CHECK-GI-NEXT: mov v4.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v5.h[2], v1.h[2] +; CHECK-GI-NEXT: mov v6.h[2], v2.h[2] +; CHECK-GI-NEXT: mov v7.h[2], v3.h[2] +; CHECK-GI-NEXT: mov v4.h[3], v0.h[3] +; CHECK-GI-NEXT: mov v5.h[3], v1.h[3] +; CHECK-GI-NEXT: mov v6.h[3], v2.h[3] +; CHECK-GI-NEXT: mov v7.h[3], v3.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v4.8h +; CHECK-GI-NEXT: xtn v1.8b, v5.8h +; CHECK-GI-NEXT: xtn v2.8b, v6.8h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: xtn v1.8b, v7.8h ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: xtn v1.8b, v3.8h ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[3], w8 @@ -320,27 +308,29 @@ define <8 x i16> @concat_v8s16_v2s16_reg(<2 x i16> %A, <2 x i16> %B, <2 x i16> % ; ; CHECK-GI-LABEL: concat_v8s16_v2s16_reg: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s4, v1.s[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s5, v0.s[1] +; 
CHECK-GI-NEXT: mov v4.s[0], v0.s[0] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v5.s[0], v1.s[0] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-GI-NEXT: mov v1.s[1], v4.s[0] -; CHECK-GI-NEXT: mov s4, v2.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v5.s[0] +; CHECK-GI-NEXT: mov v4.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v1.s[0], v2.s[0] +; CHECK-GI-NEXT: xtn v0.4h, v4.4s +; CHECK-GI-NEXT: xtn v4.4h, v5.4s +; CHECK-GI-NEXT: mov v1.s[1], v2.s[1] +; CHECK-GI-NEXT: mov v2.s[0], v3.s[0] +; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: xtn v1.4h, v1.4s -; CHECK-GI-NEXT: mov v2.s[1], v4.s[0] -; CHECK-GI-NEXT: mov s4, v3.s[1] -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v2.s[1], v3.s[1] +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: fmov w8, s4 ; CHECK-GI-NEXT: xtn v2.4h, v2.4s -; CHECK-GI-NEXT: mov v3.s[1], v4.s[0] -; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: xtn v1.4h, v3.4s -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov v0.s[3], w8 ; CHECK-GI-NEXT: ret %b = shufflevector <2 x i16> %A, <2 x i16> %B, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll index de108b0bc2b7a..e19e2ead11f4d 100644 --- a/llvm/test/CodeGen/AArch64/fabs.ll +++ b/llvm/test/CodeGen/AArch64/fabs.ll @@ -161,27 +161,21 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: fabs_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fabs v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fabs v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: fabs v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fabs_v7f16: diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll index 6913a62fb266c..b15579199a059 100644 --- a/llvm/test/CodeGen/AArch64/faddsub.ll +++ b/llvm/test/CodeGen/AArch64/faddsub.ll @@ -188,33 +188,25 @@ define <7 x half> 
@fadd_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fadd v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] +; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v3.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fadd_v7f16: @@ -537,33 +529,25 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fsub v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fsub v1.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], 
v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] +; CHECK-GI-NOFP16-NEXT: fsub v1.4s, v1.4s, v3.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fsub_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index a5d7ae147ffda..8ca1e9ee5b617 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -556,7 +556,7 @@ define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double> ; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: cset w19, lt ; CHECK-GI-NEXT: bl __lttf2 -; CHECK-GI-NEXT: fmov d0, x19 +; CHECK-GI-NEXT: mov v0.d[0], x19 ; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: cset w8, lt ; CHECK-GI-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload @@ -663,29 +663,29 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double> ; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: cset w22, lt ; CHECK-GI-NEXT: bl __lttf2 -; CHECK-GI-NEXT: ldp q0, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: sbfx x8, x21, #0, #1 -; CHECK-GI-NEXT: ldp q4, q3, [sp, #96] // 32-byte Folded Reload -; CHECK-GI-NEXT: sbfx x9, x22, #0, #1 -; CHECK-GI-NEXT: fmov d1, x8 +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v0.d[0] -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: sbfx x8, x22, #0, #1 +; CHECK-GI-NEXT: mov v2.d[1], v3.d[0] +; CHECK-GI-NEXT: ldp q4, q3, [sp, #96] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov v1.d[1], x8 ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: cset w8, lt ; CHECK-GI-NEXT: sbfx x8, x8, #0, #1 -; CHECK-GI-NEXT: mov v1.d[1], x9 -; CHECK-GI-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: bic v0.16b, v3.16b, v0.16b ; CHECK-GI-NEXT: and x9, x19, x8 ; CHECK-GI-NEXT: bic x8, x20, x8 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: orr x8, x9, x8 -; CHECK-GI-NEXT: bic v1.16b, v3.16b, v1.16b -; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b +; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b ; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #176 @@ -831,21 +831,21 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x ; CHECK-GI-NEXT: fcmp d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] -; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: cset w9, mi -; CHECK-GI-NEXT: mov 
v1.s[1], w8 -; CHECK-GI-NEXT: fmov d2, x9 +; CHECK-GI-NEXT: mov v2.d[0], x9 +; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-NEXT: fcmgt v0.2d, v3.2d, v0.2d +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v3.s[0], w9 ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: mov v2.s[1], w8 +; CHECK-GI-NEXT: mov v3.s[1], w9 ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: neg v1.4s, v1.4s -; CHECK-GI-NEXT: mov v2.s[2], w8 +; CHECK-GI-NEXT: mov v3.s[2], w9 ; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: eor v1.16b, v0.16b, v3.16b ; CHECK-GI-NEXT: and v0.16b, v6.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v7.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b @@ -902,18 +902,18 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d, ; CHECK-GI-LABEL: v3f32_float: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov v5.s[0], w9 ; CHECK-GI-NEXT: mov v4.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[1], w9 ; CHECK-GI-NEXT: mov v4.s[2], w8 -; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[2], w9 ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: neg v4.4s, v4.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: neg v1.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b @@ -980,18 +980,18 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i ; CHECK-GI-LABEL: v3f32_i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov v5.s[0], w9 ; CHECK-GI-NEXT: mov v4.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[1], w9 ; CHECK-GI-NEXT: mov v4.s[2], w8 -; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[2], w9 ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: neg v4.4s, v4.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: neg v1.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b @@ -1106,44 +1106,38 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-NOFP16-LABEL: v7f16_half: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: mov w8, #15 // =0xf -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] -; CHECK-GI-NOFP16-NEXT: fmov s4, w8 -; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov w8, #65535 // =0xffff -; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h18, v0.h[6] -; 
CHECK-GI-NOFP16-NEXT: mov h19, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4] +; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: fmov s5, w8 +; CHECK-GI-NOFP16-NEXT: mov w9, #65535 // =0xffff +; CHECK-GI-NOFP16-NEXT: fmov s7, w9 +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: mov v7.h[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], w8 +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v7.h[2], w9 ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov v7.16b, v4.16b -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s6, w8 -; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.16b, v6.16b +; CHECK-GI-NOFP16-NEXT: mov v5.h[3], w8 +; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v6.4h +; CHECK-GI-NOFP16-NEXT: mov v7.h[3], w9 ; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v18.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v6.h[0] -; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v19.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h -; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v6.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v16.4s, v16.4h -; CHECK-GI-NOFP16-NEXT: mov v7.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[3], v6.h[0] -; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v16.4s, v5.4s -; CHECK-GI-NOFP16-NEXT: mov v7.h[4], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[4], v6.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[4], w8 +; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v6.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: mov v7.h[4], w9 +; CHECK-GI-NOFP16-NEXT: mov v5.h[5], w8 ; CHECK-GI-NOFP16-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-GI-NOFP16-NEXT: mov v7.h[5], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[5], v6.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[6], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[6], v6.h[0] -; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v7.8h -; CHECK-GI-NOFP16-NEXT: neg v1.8h, v7.8h +; CHECK-GI-NOFP16-NEXT: mov v7.h[5], w9 +; CHECK-GI-NOFP16-NEXT: mov v5.h[6], w8 +; CHECK-GI-NOFP16-NEXT: mov v7.h[6], w9 +; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v5.8h +; CHECK-GI-NOFP16-NEXT: neg v1.8h, v5.8h ; CHECK-GI-NOFP16-NEXT: sshl v0.8h, v0.8h, v1.8h -; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v17.16b +; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v7.16b ; CHECK-GI-NOFP16-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NOFP16-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: orr v0.16b, v0.16b, v1.16b @@ -1152,28 +1146,26 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-FP16-LABEL: v7f16_half: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: mov w8, #15 // =0xf +; CHECK-GI-FP16-NEXT: mov w9, #65535 // =0xffff ; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h ; CHECK-GI-FP16-NEXT: fmov s4, w8 -; CHECK-GI-FP16-NEXT: mov w8, #65535 // =0xffff -; CHECK-GI-FP16-NEXT: fmov s6, w8 -; CHECK-GI-FP16-NEXT: mov v5.16b, v4.16b -; CHECK-GI-FP16-NEXT: mov v7.16b, v6.16b -; CHECK-GI-FP16-NEXT: mov v5.h[1], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[1], v6.h[0] -; CHECK-GI-FP16-NEXT: mov v5.h[2], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[2], v6.h[0] -; CHECK-GI-FP16-NEXT: mov v5.h[3], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[3], v6.h[0] -; 
CHECK-GI-FP16-NEXT: mov v5.h[4], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[4], v6.h[0] -; CHECK-GI-FP16-NEXT: mov v5.h[5], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[5], v6.h[0] -; CHECK-GI-FP16-NEXT: mov v5.h[6], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[6], v6.h[0] -; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v5.8h -; CHECK-GI-FP16-NEXT: neg v1.8h, v5.8h +; CHECK-GI-FP16-NEXT: fmov s5, w9 +; CHECK-GI-FP16-NEXT: mov v4.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[1], w9 +; CHECK-GI-FP16-NEXT: mov v4.h[2], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[2], w9 +; CHECK-GI-FP16-NEXT: mov v4.h[3], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[3], w9 +; CHECK-GI-FP16-NEXT: mov v4.h[4], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[4], w9 +; CHECK-GI-FP16-NEXT: mov v4.h[5], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[5], w9 +; CHECK-GI-FP16-NEXT: mov v4.h[6], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[6], w9 +; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v4.8h +; CHECK-GI-FP16-NEXT: neg v1.8h, v4.8h ; CHECK-GI-FP16-NEXT: sshl v0.8h, v0.8h, v1.8h -; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v7.16b +; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v5.16b ; CHECK-GI-FP16-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-FP16-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-GI-FP16-NEXT: orr v0.16b, v0.16b, v1.16b @@ -1599,59 +1591,52 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; ; CHECK-GI-NOFP16-LABEL: v7f16_i32: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] ; CHECK-GI-NOFP16-NEXT: mov w8, #31 // =0x1f -; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[5] -; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #32] -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[6] -; CHECK-GI-NOFP16-NEXT: fmov s16, w0 -; CHECK-GI-NOFP16-NEXT: fmov s18, w4 +; CHECK-GI-NOFP16-NEXT: mov v4.s[0], w8 +; CHECK-GI-NOFP16-NEXT: mov w9, #-1 // =0xffffffff +; CHECK-GI-NOFP16-NEXT: mov v5.s[0], w0 +; CHECK-GI-NOFP16-NEXT: mov v6.s[0], w9 +; CHECK-GI-NOFP16-NEXT: mov v7.s[0], w7 +; CHECK-GI-NOFP16-NEXT: ldr s16, [sp] +; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #24] +; CHECK-GI-NOFP16-NEXT: ldr s18, [sp, #32] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w1 +; CHECK-GI-NOFP16-NEXT: mov v17.s[1], v18.s[0] +; CHECK-GI-NOFP16-NEXT: mov v6.s[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v16.s[0] +; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #8] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8 ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s3, w8 -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] -; CHECK-GI-NOFP16-NEXT: ldr s5, [sp] -; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w1 -; CHECK-GI-NOFP16-NEXT: mov v18.s[1], w5 -; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w8 -; CHECK-GI-NOFP16-NEXT: fmov w9, s5 -; CHECK-GI-NOFP16-NEXT: fmov s5, w7 -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v6.h[0] -; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #8] -; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v7.h[0] -; CHECK-GI-NOFP16-NEXT: ldr s7, [sp, #24] -; CHECK-GI-NOFP16-NEXT: mov v16.s[2], w2 -; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w9 -; CHECK-GI-NOFP16-NEXT: fmov w9, s6 -; 
CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #16] -; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8 -; CHECK-GI-NOFP16-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v17.s[0] -; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #40] +; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w2 +; CHECK-GI-NOFP16-NEXT: mov v6.s[2], w9 +; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v16.s[0] +; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #40] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6 -; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h -; CHECK-GI-NOFP16-NEXT: mov v16.s[3], w3 -; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w9 -; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v17.s[0] -; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v4.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: fmov s4, w8 -; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8 -; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: neg v3.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8 -; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: fmov w8, s6 -; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w8 -; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v4.16b -; CHECK-GI-NOFP16-NEXT: and v2.16b, v18.16b, v2.16b -; CHECK-GI-NOFP16-NEXT: and v1.16b, v7.16b, v1.16b -; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v16.16b, v5.16b +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: mov v17.s[2], v16.s[0] +; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w3 +; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v3.s[0], w4 +; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w5 +; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: ldr s4, [sp, #16] +; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w6 +; CHECK-GI-NOFP16-NEXT: mov v7.s[3], v4.s[0] +; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v6.16b +; CHECK-GI-NOFP16-NEXT: and v2.16b, v3.16b, v2.16b +; CHECK-GI-NOFP16-NEXT: and v1.16b, v17.16b, v1.16b +; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v5.16b, v7.16b ; CHECK-GI-NOFP16-NEXT: orr v1.16b, v2.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1] ; CHECK-GI-NOFP16-NEXT: mov s3, v0.s[2] @@ -1670,59 +1655,56 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-LABEL: v7f16_i32: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h -; CHECK-GI-FP16-NEXT: mov w10, #31 // =0x1f -; CHECK-GI-FP16-NEXT: ldr s3, [sp] -; CHECK-GI-FP16-NEXT: fmov s2, w10 -; CHECK-GI-FP16-NEXT: fmov s6, w0 -; CHECK-GI-FP16-NEXT: ldr s4, [sp, #8] -; CHECK-GI-FP16-NEXT: fmov s17, w4 -; CHECK-GI-FP16-NEXT: ldr s7, [sp, #24] +; CHECK-GI-FP16-NEXT: mov w9, #31 // =0x1f +; CHECK-GI-FP16-NEXT: mov v4.s[0], w0 +; CHECK-GI-FP16-NEXT: mov v2.s[0], w9 +; CHECK-GI-FP16-NEXT: mov v5.s[0], w7 +; CHECK-GI-FP16-NEXT: ldr s6, [sp] +; CHECK-GI-FP16-NEXT: mov v7.s[0], w4 ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32] +; CHECK-GI-FP16-NEXT: ldr s17, [sp, #8] ; CHECK-GI-FP16-NEXT: umov w8, v0.h[4] -; CHECK-GI-FP16-NEXT: umov w9, v0.h[5] -; CHECK-GI-FP16-NEXT: mov v2.s[1], w10 -; CHECK-GI-FP16-NEXT: mov v6.s[1], w1 -; CHECK-GI-FP16-NEXT: mov v17.s[1], w5 -; CHECK-GI-FP16-NEXT: mov v7.s[1], v16.s[0] +; CHECK-GI-FP16-NEXT: umov w10, v0.h[5] +; CHECK-GI-FP16-NEXT: mov v4.s[1], w1 +; CHECK-GI-FP16-NEXT: mov v2.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v5.s[1], v6.s[0] +; CHECK-GI-FP16-NEXT: ldr s6, [sp, #24] +; CHECK-GI-FP16-NEXT: mov v7.s[1], w5 +; CHECK-GI-FP16-NEXT: mov v6.s[1], v16.s[0] ; CHECK-GI-FP16-NEXT: ldr 
s16, [sp, #40] -; CHECK-GI-FP16-NEXT: fmov s1, w8 +; CHECK-GI-FP16-NEXT: mov v1.s[0], w8 ; CHECK-GI-FP16-NEXT: umov w8, v0.h[6] -; CHECK-GI-FP16-NEXT: mov v2.s[2], w10 ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-FP16-NEXT: mov v6.s[2], w2 -; CHECK-GI-FP16-NEXT: mov v17.s[2], w6 -; CHECK-GI-FP16-NEXT: mov v7.s[2], v16.s[0] -; CHECK-GI-FP16-NEXT: mov v1.s[1], w9 -; CHECK-GI-FP16-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-GI-FP16-NEXT: fmov s5, w9 +; CHECK-GI-FP16-NEXT: mov v2.s[2], w9 +; CHECK-GI-FP16-NEXT: mov v4.s[2], w2 +; CHECK-GI-FP16-NEXT: mov v5.s[2], v17.s[0] +; CHECK-GI-FP16-NEXT: mov v7.s[2], w6 ; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31 -; CHECK-GI-FP16-NEXT: mov v6.s[3], w3 -; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 -; CHECK-GI-FP16-NEXT: fmov w8, s3 -; CHECK-GI-FP16-NEXT: fmov s3, w7 -; CHECK-GI-FP16-NEXT: mov v5.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v6.s[2], v16.s[0] +; CHECK-GI-FP16-NEXT: mov v1.s[1], w10 +; CHECK-GI-FP16-NEXT: mov w10, #-1 // =0xffffffff +; CHECK-GI-FP16-NEXT: mov v3.s[0], w10 +; CHECK-GI-FP16-NEXT: mov v4.s[3], w3 ; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31 -; CHECK-GI-FP16-NEXT: mov v3.s[1], w8 -; CHECK-GI-FP16-NEXT: fmov w8, s4 -; CHECK-GI-FP16-NEXT: ldr s4, [sp, #16] +; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 +; CHECK-GI-FP16-NEXT: mov v3.s[1], w10 ; CHECK-GI-FP16-NEXT: ushl v1.4s, v1.4s, v2.4s ; CHECK-GI-FP16-NEXT: neg v2.4s, v2.4s -; CHECK-GI-FP16-NEXT: mov v5.s[2], w9 -; CHECK-GI-FP16-NEXT: mov v3.s[2], w8 +; CHECK-GI-FP16-NEXT: mov v3.s[2], w10 ; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v2.4s -; CHECK-GI-FP16-NEXT: fmov w8, s4 -; CHECK-GI-FP16-NEXT: eor v2.16b, v1.16b, v5.16b -; CHECK-GI-FP16-NEXT: and v1.16b, v17.16b, v1.16b -; CHECK-GI-FP16-NEXT: mov v3.s[3], w8 -; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v2.16b -; CHECK-GI-FP16-NEXT: bsl v0.16b, v6.16b, v3.16b +; CHECK-GI-FP16-NEXT: ldr s2, [sp, #16] +; CHECK-GI-FP16-NEXT: mov v5.s[3], v2.s[0] +; CHECK-GI-FP16-NEXT: eor v3.16b, v1.16b, v3.16b +; CHECK-GI-FP16-NEXT: and v1.16b, v7.16b, v1.16b +; CHECK-GI-FP16-NEXT: and v2.16b, v6.16b, v3.16b +; CHECK-GI-FP16-NEXT: bsl v0.16b, v4.16b, v5.16b ; CHECK-GI-FP16-NEXT: orr v1.16b, v1.16b, v2.16b ; CHECK-GI-FP16-NEXT: mov s2, v0.s[1] ; CHECK-GI-FP16-NEXT: mov s3, v0.s[2] ; CHECK-GI-FP16-NEXT: mov s4, v0.s[3] +; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: mov s5, v1.s[1] ; CHECK-GI-FP16-NEXT: mov s6, v1.s[2] -; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: fmov w4, s1 ; CHECK-GI-FP16-NEXT: fmov w1, s2 ; CHECK-GI-FP16-NEXT: fmov w2, s3 diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll index 84376107679d8..a42ec8e253be2 100644 --- a/llvm/test/CodeGen/AArch64/fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/fcopysign.ll @@ -156,8 +156,8 @@ define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000 ; CHECK-GI-NEXT: mov w9, #2147483647 // =0x7fffffff -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v2.s[0], w9 +; CHECK-GI-NEXT: mov v3.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mov v3.s[1], w8 ; CHECK-GI-NEXT: mov v2.s[2], w9 @@ -207,22 +207,20 @@ define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: mov w9, #32767 // =0x7fff ; CHECK-GI-NEXT: fmov s2, w9 ; CHECK-GI-NEXT: fmov s3, w8 -; CHECK-GI-NEXT: mov v4.16b, v2.16b -; CHECK-GI-NEXT: mov v5.16b, v3.16b -; CHECK-GI-NEXT: mov v4.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[1], v3.h[0] -; 
CHECK-GI-NEXT: mov v4.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v4.h[3], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[3], v3.h[0] -; CHECK-GI-NEXT: mov v4.h[4], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v4.h[5], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[5], v3.h[0] -; CHECK-GI-NEXT: mov v4.h[6], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[6], v3.h[0] -; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b -; CHECK-GI-NEXT: and v1.16b, v1.16b, v5.16b +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: mov v2.h[2], w9 +; CHECK-GI-NEXT: mov v3.h[2], w8 +; CHECK-GI-NEXT: mov v2.h[3], w9 +; CHECK-GI-NEXT: mov v3.h[3], w8 +; CHECK-GI-NEXT: mov v2.h[4], w9 +; CHECK-GI-NEXT: mov v3.h[4], w8 +; CHECK-GI-NEXT: mov v2.h[5], w9 +; CHECK-GI-NEXT: mov v3.h[5], w8 +; CHECK-GI-NEXT: mov v2.h[6], w9 +; CHECK-GI-NEXT: mov v3.h[6], w8 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll index 1c761ea083028..b408e9c1bd4e6 100644 --- a/llvm/test/CodeGen/AArch64/fcvt.ll +++ b/llvm/test/CodeGen/AArch64/fcvt.ll @@ -164,27 +164,21 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: ceil_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frintp v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frintp v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frintp v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: ceil_v7f16: @@ -469,27 +463,21 @@ define <7 x half> @floor_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: floor_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frintm v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] 
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frintm v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frintm v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: floor_v7f16: @@ -774,27 +762,21 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: nearbyint_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frinti v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frinti v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frinti v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: nearbyint_v7f16: @@ -1079,27 +1061,21 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: roundeven_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frintn v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; 
CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frintn v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frintn v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: roundeven_v7f16: @@ -1384,27 +1360,21 @@ define <7 x half> @rint_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: rint_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frintx v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frintx v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frintx v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: rint_v7f16: @@ -1689,27 +1659,21 @@ define <7 x half> @round_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: round_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frinta v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; 
CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frinta v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frinta v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: round_v7f16: @@ -1994,27 +1958,21 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: trunc_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frintz v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frintz v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frintz v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: trunc_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll index d73a5dc73eefc..5bdccccc62b99 100644 --- a/llvm/test/CodeGen/AArch64/fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fdiv.ll @@ -188,33 +188,25 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4] -; CHECK-GI-NOFP16-NEXT: 
mov h7, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fdiv v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v1.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v0.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fdiv_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll index 93d3d96d67b65..30ce389f23128 100644 --- a/llvm/test/CodeGen/AArch64/fexplog.ll +++ b/llvm/test/CodeGen/AArch64/fexplog.ll @@ -678,12 +678,12 @@ define <7 x half> @exp_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -701,18 +701,19 @@ define <7 x half> @exp_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; 
CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -789,21 +790,21 @@ define <4 x half> @exp_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -919,12 +920,12 @@ define <8 x half> @exp_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -947,21 +948,21 @@ define <8 x half> @exp_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 
32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -1155,7 +1156,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h9 @@ -1180,7 +1181,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -1231,7 +1232,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -1257,7 +1258,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] @@ -1948,12 +1949,12 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -1971,18 +1972,19 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp 
q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2059,21 +2061,21 @@ define <4 x half> @exp2_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2189,12 +2191,12 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -2217,21 +2219,21 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov 
v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2425,7 +2427,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h9 @@ -2450,7 +2452,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -2501,7 +2503,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -2527,7 +2529,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] @@ -3218,12 +3220,12 @@ define <7 x half> @log_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -3241,18 +3243,19 @@ define <7 x half> @log_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; 
CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -3329,21 +3332,21 @@ define <4 x half> @log_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -3459,12 +3462,12 @@ define <8 x half> @log_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -3487,21 +3490,21 @@ define <8 x half> @log_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] 
+; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -3695,7 +3698,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h9 @@ -3720,7 +3723,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -3771,7 +3774,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -3797,7 +3800,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] @@ -4488,12 +4491,12 @@ define <7 x half> @log2_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -4511,18 +4514,19 @@ define <7 x half> @log2_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov 
v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -4599,21 +4603,21 @@ define <4 x half> @log2_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -4729,12 +4733,12 @@ define <8 x half> @log2_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -4757,21 +4761,21 @@ define <8 x half> @log2_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte 
Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -4965,7 +4969,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h9 @@ -4990,7 +4994,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -5041,7 +5045,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -5067,7 +5071,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] @@ -5758,12 +5762,12 @@ define <7 x half> @log10_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -5781,18 +5785,19 @@ define <7 x half> @log10_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; 
CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -5869,21 +5874,21 @@ define <4 x half> @log10_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -5999,12 +6004,12 @@ define <8 x half> @log10_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -6027,21 +6032,21 @@ define <8 x half> @log10_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded 
Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -6235,7 +6240,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h9 @@ -6260,7 +6265,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -6311,7 +6316,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -6337,7 +6342,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll index 2ea7e0f3c44a9..aa20304e52a95 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll @@ -11,13 +11,15 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) { ; CHECK-GI-LABEL: interleave2_v4f16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: dup v2.4s, w8 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: xtn v2.4h, v2.4s -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: xtn v0.4h, v2.4s +; CHECK-GI-NEXT: mov v1.s[0], w8 +; CHECK-GI-NEXT: mov v2.s[0], w9 +; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: mov v2.s[1], w8 +; CHECK-GI-NEXT: zip1 v0.4h, v1.4h, v2.4h ; CHECK-GI-NEXT: ret %retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1) ret <4 x half> %retval diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll index 357d91960624b..fb12f8acf1745 100644 --- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll +++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll @@ -664,33 +664,25 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5] 
-; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4] -; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4] ; CHECK-NOFP16-GI-NEXT: fmin v2.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6] -; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s -; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3] -; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h -; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-NOFP16-GI-NEXT: fmin v1.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4] +; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5] +; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5] +; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s +; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] +; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h +; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h +; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] +; CHECK-NOFP16-GI-NEXT: fmin v1.4s, v1.4s, v3.4s +; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2] ; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3] ; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: min_v7f16: @@ -770,33 +762,25 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5] -; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4] -; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4] ; CHECK-NOFP16-GI-NEXT: fmax v2.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6] -; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s -; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3] -; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h -; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-NOFP16-GI-NEXT: fmax v1.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4] +; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5] +; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5] +; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s +; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] +; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h +; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h +; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] +; CHECK-NOFP16-GI-NEXT: fmax v1.4s, v1.4s, v3.4s +; 
CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2] ; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3] ; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: max_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll index 61199f82615bb..64f0da8b4cd0f 100644 --- a/llvm/test/CodeGen/AArch64/fminmax.ll +++ b/llvm/test/CodeGen/AArch64/fminmax.ll @@ -664,33 +664,25 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5] -; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4] -; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4] ; CHECK-NOFP16-GI-NEXT: fminnm v2.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6] -; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s -; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3] -; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h -; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-NOFP16-GI-NEXT: fminnm v1.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4] +; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5] +; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5] +; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s +; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] +; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h +; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h +; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] +; CHECK-NOFP16-GI-NEXT: fminnm v1.4s, v1.4s, v3.4s +; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2] ; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3] ; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: min_v7f16: @@ -770,33 +762,25 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5] -; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4] -; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4] ; CHECK-NOFP16-GI-NEXT: fmaxnm v2.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6] -; CHECK-NOFP16-GI-NEXT: mov v4.h[1], 
v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s -; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3] -; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h -; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-NOFP16-GI-NEXT: fmaxnm v1.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4] +; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5] +; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5] +; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s +; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] +; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h +; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h +; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] +; CHECK-NOFP16-GI-NEXT: fmaxnm v1.4s, v1.4s, v3.4s +; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2] ; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3] ; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: max_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll index 4b019b57d968d..7bcaae5a77eac 100644 --- a/llvm/test/CodeGen/AArch64/fmla.ll +++ b/llvm/test/CodeGen/AArch64/fmla.ll @@ -257,39 +257,29 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h18, v2.h[4] -; CHECK-GI-NOFP16-NEXT: mov h19, v2.h[5] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] -; CHECK-GI-NOFP16-NEXT: mov h2, v2.h[6] +; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fmla v5.4s, v4.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0] -; CHECK-GI-NOFP16-NEXT: mov v18.h[1], v19.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v5.4s -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v3.h[0] -; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov v18.h[2], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[2] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[3] -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v16.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v18.4h -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fmla v4.4s, v3.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v5.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v4.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v6.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v2.h[4] +; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; 
CHECK-GI-NOFP16-NEXT: mov v4.h[1], v2.h[5] +; CHECK-GI-NOFP16-NEXT: fcvtn v5.4h, v5.4s +; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v2.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v5.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v6.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v5.h[1] +; CHECK-GI-NOFP16-NEXT: fmla v3.4s, v2.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v5.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v3.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fma_v7f16: @@ -864,46 +854,36 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s -; CHECK-GI-NOFP16-NEXT: mov h4, v2.h[5] -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v16.h[0] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v2.h[4] -; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v7.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4] +; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov h1, v2.h[6] -; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v5.4s, v6.4s -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5] +; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6] +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1] +; CHECK-GI-NOFP16-NEXT: fadd v1.4s, 
v1.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fmuladd_v7f16: @@ -1362,46 +1342,36 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s -; CHECK-GI-NOFP16-NEXT: mov h4, v2.h[5] -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v16.h[0] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v2.h[4] -; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v7.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4] +; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov h1, v2.h[6] -; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v5.4s, v6.4s -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5] +; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6] +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1] +; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov 
v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fmul_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll index 1f41f2385c335..bd3d1353e643e 100644 --- a/llvm/test/CodeGen/AArch64/fmul.ll +++ b/llvm/test/CodeGen/AArch64/fmul.ll @@ -188,33 +188,25 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] +; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v1.4s, v3.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fmul_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll index cc0f7d2fd6075..a0e9edff733e0 100644 --- a/llvm/test/CodeGen/AArch64/fneg.ll +++ b/llvm/test/CodeGen/AArch64/fneg.ll @@ -162,27 +162,21 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: fabs_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fneg v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fneg 
v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: fneg v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fabs_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll index 8d40121ad4543..6e8cd0c8c00b4 100644 --- a/llvm/test/CodeGen/AArch64/fpow.ll +++ b/llvm/test/CodeGen/AArch64/fpow.ll @@ -965,22 +965,22 @@ define <4 x half> @pow_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h12 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h13 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll index 5dbcaa4a5fda1..62fc1c0854ca8 100644 --- a/llvm/test/CodeGen/AArch64/fpowi.ll +++ b/llvm/test/CodeGen/AArch64/fpowi.ll @@ -869,22 +869,22 @@ define <4 x half> @powi_v4f16(<4 x half> %a, i32 %b) { ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl __powisf2 ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; 
CHECK-GI-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll index 0c880592d955b..20b5567e973d0 100644 --- a/llvm/test/CodeGen/AArch64/fptoi.ll +++ b/llvm/test/CodeGen/AArch64/fptoi.ll @@ -2585,7 +2585,7 @@ define <3 x i64> @fptos_v3f32_v3i64(<3 x float> %a) { ; ; CHECK-GI-LABEL: fptos_v3f32_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[2] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[2] ; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s ; CHECK-GI-NEXT: fcvtl v1.2d, v1.2s ; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d @@ -2614,7 +2614,7 @@ define <3 x i64> @fptou_v3f32_v3i64(<3 x float> %a) { ; ; CHECK-GI-LABEL: fptou_v3f32_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[2] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[2] ; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s ; CHECK-GI-NEXT: fcvtl v1.2d, v1.2s ; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d @@ -3181,10 +3181,10 @@ define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) { ; CHECK-GI-LABEL: fptos_v3f32_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -3202,10 +3202,10 @@ define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) { ; CHECK-GI-LABEL: fptou_v3f32_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -6077,10 +6077,10 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; @@ -6110,10 +6110,10 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, 
v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; @@ -7297,7 +7297,7 @@ define <2 x i64> @fptos_v2f128_v2i64(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov x19, x0 ; CHECK-GI-NEXT: bl __fixtfdi -; CHECK-GI-NEXT: fmov d0, x19 +; CHECK-GI-NEXT: mov v0.d[0], x19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.d[1], x0 ; CHECK-GI-NEXT: add sp, sp, #32 @@ -7340,7 +7340,7 @@ define <2 x i64> @fptou_v2f128_v2i64(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov x19, x0 ; CHECK-GI-NEXT: bl __fixunstfdi -; CHECK-GI-NEXT: fmov d0, x19 +; CHECK-GI-NEXT: mov v0.d[0], x19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.d[1], x0 ; CHECK-GI-NEXT: add sp, sp, #32 @@ -7496,7 +7496,7 @@ define <2 x i32> @fptos_v2f128_v2i32(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7539,7 +7539,7 @@ define <2 x i32> @fptou_v2f128_v2i32(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7591,7 +7591,7 @@ define <3 x i32> @fptos_v3f128_v3i32(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -7644,7 +7644,7 @@ define <3 x i32> @fptou_v3f128_v3i32(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -7689,9 +7689,8 @@ define <2 x i16> @fptos_v2f128_v2i16(<2 x fp128> %a) { ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixtfsi ; CHECK-GI-NEXT: fmov s0, w19 -; CHECK-GI-NEXT: fmov s1, w0 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w0 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #32 @@ -7734,9 +7733,8 @@ define <2 x i16> @fptou_v2f128_v2i16(<2 x fp128> %a) { ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: fmov s0, w19 -; CHECK-GI-NEXT: fmov s1, w0 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w0 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: 
add sp, sp, #32 @@ -7791,12 +7789,10 @@ define <3 x i16> @fptos_v3f128_v3i16(<3 x fp128> %a) { ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixtfsi ; CHECK-GI-NEXT: fmov s0, w19 -; CHECK-GI-NEXT: fmov s1, w20 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v0.h[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret @@ -7850,12 +7846,10 @@ define <3 x i16> @fptou_v3f128_v3i16(<3 x fp128> %a) { ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: fmov s0, w19 -; CHECK-GI-NEXT: fmov s1, w20 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v0.h[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret @@ -7896,7 +7890,7 @@ define <2 x i8> @fptos_v2f128_v2i8(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7939,7 +7933,7 @@ define <2 x i8> @fptou_v2f128_v2i8(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll index aec5d7959226c..c0d4ddef23132 100644 --- a/llvm/test/CodeGen/AArch64/fptrunc.ll +++ b/llvm/test/CodeGen/AArch64/fptrunc.ll @@ -261,9 +261,9 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: fcvt s2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] ; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] ; CHECK-GI-NEXT: ret entry: @@ -363,9 +363,9 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) { ; CHECK-GI-LABEL: fptrunc_v2f32_v2f16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll index 1a10fd2f1cdc3..fe5146d79895c 100644 --- a/llvm/test/CodeGen/AArch64/frem.ll +++ b/llvm/test/CodeGen/AArch64/frem.ll @@ -952,22 +952,22 @@ 
define <4 x half> @frem_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h12 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h13 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll index 0b34f9570fa77..557add3a4eaeb 100644 --- a/llvm/test/CodeGen/AArch64/fsincos.ll +++ b/llvm/test/CodeGen/AArch64/fsincos.ll @@ -678,12 +678,12 @@ define <7 x half> @sin_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -701,18 +701,19 @@ define <7 x half> @sin_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -789,21 +790,21 @@ define <4 x half> 
@sin_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -919,12 +920,12 @@ define <8 x half> @sin_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -947,21 +948,21 @@ define <8 x half> @sin_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -1155,7 +1156,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; 
CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h9 @@ -1180,7 +1181,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -1231,7 +1232,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -1257,7 +1258,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] @@ -1948,12 +1949,12 @@ define <7 x half> @cos_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -1971,18 +1972,19 @@ define <7 x half> @cos_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2059,21 +2061,21 @@ define <4 x half> @cos_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl cosf ; 
CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2189,12 +2191,12 @@ define <8 x half> @cos_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -2217,21 +2219,21 @@ define <8 x half> @cos_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2425,7 +2427,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded 
Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h9 @@ -2450,7 +2452,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -2501,7 +2503,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -2527,7 +2529,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll index 4b48bcc5508db..6c5fd8e52b017 100644 --- a/llvm/test/CodeGen/AArch64/fsqrt.ll +++ b/llvm/test/CodeGen/AArch64/fsqrt.ll @@ -196,27 +196,21 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: sqrt_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fsqrt v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: fsqrt v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fsqrt v2.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: sqrt_v7f16: diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index 6baf1a84d407c..b00e5d6c701d8 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ 
b/llvm/test/CodeGen/AArch64/icmp.ll @@ -1228,18 +1228,18 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32> ; CHECK-GI-LABEL: v3i32_i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov v5.s[0], w9 ; CHECK-GI-NEXT: mov v4.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[1], w9 ; CHECK-GI-NEXT: mov v4.s[2], w8 -; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[2], w9 ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: neg v4.4s, v4.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: neg v1.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll index 8b82004388b09..296e267a9c7f0 100644 --- a/llvm/test/CodeGen/AArch64/insertextract.ll +++ b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -250,23 +250,13 @@ entry: } define <3 x float> @insert_v3f32_0(<3 x float> %a, float %b, i32 %c) { -; CHECK-SD-LABEL: insert_v3f32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-SD-NEXT: mov v1.s[1], v0.s[1] -; CHECK-SD-NEXT: mov v1.s[2], v0.s[2] -; CHECK-SD-NEXT: mov v0.16b, v1.16b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: insert_v3f32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-GI-NEXT: mov s0, v0.s[2] -; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: insert_v3f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: mov v1.s[1], v0.s[1] +; CHECK-NEXT: mov v1.s[2], v0.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret entry: %d = insertelement <3 x float> %a, float %b, i32 0 ret <3 x float> %d @@ -281,10 +271,11 @@ define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) { ; ; CHECK-GI-LABEL: insert_v3f32_2: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %d = insertelement <3 x float> %a, float %b, i32 2 @@ -983,11 +974,9 @@ define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) { ; ; CHECK-GI-LABEL: insert_v3i32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov w9, v0.s[2] -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mov v1.s[2], w9 +; CHECK-GI-NEXT: mov v1.s[0], w0 +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] ; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: @@ -1003,10 +992,10 @@ define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) { ; ; CHECK-GI-LABEL: insert_v3i32_2: ; 
CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], w0 +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: %d = insertelement <3 x i32> %a, i32 %b, i32 2 diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index 7a4c5cee27b80..4ac04798e1548 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -3309,30 +3309,28 @@ define <3 x double> @stofp_v3i8_v3f64(<3 x i8> %a) { ; CHECK-GI-LABEL: stofp_v3i8_v3f64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v0.h[2], w2 ; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8 -; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] ; CHECK-GI-NEXT: mov h2, v1.h[1] ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] ; CHECK-GI-NEXT: smov x8, v0.s[0] ; CHECK-GI-NEXT: smov x9, v0.s[1] -; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0 -; CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: smov x8, v0.s[0] +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: smov x8, v1.s[0] +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: smov x9, v1.s[1] +; CHECK-GI-NEXT: mov v1.d[0], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 -; CHECK-GI-NEXT: smov x9, v0.s[1] -; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: scvtf v0.2d, v1.2d -; CHECK-GI-NEXT: mov v2.d[1], x9 +; CHECK-GI-NEXT: scvtf v0.2d, v0.2d +; CHECK-GI-NEXT: scvtf v2.2d, v1.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: scvtf v2.2d, v2.2d ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: ret @@ -3363,30 +3361,28 @@ define <3 x double> @utofp_v3i8_v3f64(<3 x i8> %a) { ; CHECK-GI-LABEL: utofp_v3i8_v3f64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] ; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v0.h[2], w2 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] ; CHECK-GI-NEXT: mov h2, v1.h[1] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] ; CHECK-GI-NEXT: mov w8, v0.s[0] ; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: ushll v0.4s, v1.4h, #0 -; CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: mov w8, v0.s[0] +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov w8, v1.s[0] +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v1.d[0], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 -; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: ucvtf v0.2d, v1.2d -; CHECK-GI-NEXT: mov v2.d[1], x9 +; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d +; CHECK-GI-NEXT: ucvtf v2.2d, v1.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: ucvtf v2.2d, v2.2d 
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: ret @@ -4479,13 +4475,13 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: scvtf v1.2d, v2.2d +; CHECK-GI-NEXT: scvtf v2.2d, v2.2d ; CHECK-GI-NEXT: scvtf v0.2d, v0.2d -; CHECK-GI-NEXT: fcvtn v1.2s, v1.2d -; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d +; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] ; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i64> %a to <3 x float> @@ -4511,13 +4507,13 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ucvtf v1.2d, v2.2d +; CHECK-GI-NEXT: ucvtf v2.2d, v2.2d ; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d -; CHECK-GI-NEXT: fcvtn v1.2s, v1.2d -; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d +; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] ; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i64> %a to <3 x float> @@ -5267,10 +5263,8 @@ define <3 x float> @stofp_v3i8_v3f32(<3 x i8> %a) { ; CHECK-GI-LABEL: stofp_v3i8_v3f32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v0.h[2], w2 ; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-GI-NEXT: mov s1, v0.s[1] @@ -5302,11 +5296,9 @@ define <3 x float> @utofp_v3i8_v3f32(<3 x i8> %a) { ; CHECK-GI-LABEL: utofp_v3i8_v3f32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] ; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v0.h[2], w2 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: mov h2, v0.h[1] @@ -6227,9 +6219,9 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: scvtf v0.2d, v0.2d ; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -6276,9 +6268,9 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: ucvtf v0.2d, v0.2d ; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NOFP16-NEXT: mov s1, 
v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7215,9 +7207,9 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-GI-LABEL: stofp_v2i32_v2f16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7238,9 +7230,9 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-GI-LABEL: utofp_v2i32_v2f16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7448,9 +7440,9 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #16 ; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7459,8 +7451,8 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 ; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] @@ -7491,9 +7483,9 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-NOFP16-NEXT: movi d1, #0x00ffff0000ffff ; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7502,8 +7494,8 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; 
CHECK-GI-FP16-NEXT: mov v0.h[1], w8 ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] @@ -7986,9 +7978,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7997,9 +7989,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-FP16-LABEL: stofp_v2i8_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-FP16-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-FP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-FP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-FP16-NEXT: xtn v0.4h, v1.4s ; CHECK-GI-FP16-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-FP16-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] @@ -8048,9 +8040,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-NOFP16-NEXT: movi d1, #0x0000ff000000ff ; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -8059,16 +8051,16 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-FP16-LABEL: utofp_v2i8_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-FP16-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-FP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-FP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-FP16-NEXT: xtn v0.4h, v1.4s ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-FP16-NEXT: movi d1, #0x0000ff000000ff ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-FP16-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-FP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] @@ -8105,10 +8097,8 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-NOFP16-LABEL: stofp_v3i8_v3f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fmov s0, w0 -; CHECK-GI-NOFP16-NEXT: fmov s1, w1 -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s1, w2 -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w1 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w2 ; CHECK-GI-NOFP16-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-NOFP16-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] @@ -8126,10 +8116,8 
@@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-FP16-LABEL: stofp_v3i8_v3f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: fmov s0, w0 -; CHECK-GI-FP16-NEXT: fmov s1, w1 -; CHECK-GI-FP16-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-FP16-NEXT: fmov s1, w2 -; CHECK-GI-FP16-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-FP16-NEXT: mov v0.b[1], w1 +; CHECK-GI-FP16-NEXT: mov v0.b[2], w2 ; CHECK-GI-FP16-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: ret @@ -8162,11 +8150,9 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-NOFP16-LABEL: utofp_v3i8_v3f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fmov s0, w0 -; CHECK-GI-NOFP16-NEXT: fmov s1, w1 -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s1, w2 -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[0] ; CHECK-GI-NOFP16-NEXT: movi d1, #0xff00ff00ff00ff +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w1 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w2 ; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] ; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] @@ -8183,10 +8169,8 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-FP16-LABEL: utofp_v3i8_v3f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: fmov s0, w0 -; CHECK-GI-FP16-NEXT: fmov s1, w1 -; CHECK-GI-FP16-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-FP16-NEXT: fmov s1, w2 -; CHECK-GI-FP16-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-FP16-NEXT: mov v0.b[1], w1 +; CHECK-GI-FP16-NEXT: mov v0.b[2], w2 ; CHECK-GI-FP16-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll index 51d17ad0644f1..c1ea891bc86e7 100644 --- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll +++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll @@ -267,21 +267,21 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) { ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: fcvt s1, h9 ; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: fcvt s1, h10 ; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill +; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; GISEL-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; GISEL-NEXT: fcvt h0, s0 +; GISEL-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; GISEL-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; GISEL-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; GISEL-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; GISEL-NEXT: mov v1.h[1], v2.h[0] -; GISEL-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; GISEL-NEXT: mov v1.h[1], v3.h[0] ; GISEL-NEXT: mov v1.h[2], v2.h[0] ; GISEL-NEXT: mov v1.h[3], v0.h[0] ; GISEL-NEXT: mov v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index c3c0ec5e3d9d8..a4d1c53c272aa 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -118,7 +118,7 @@ define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b){ ; ; CHECK-GI-LABEL: load_v2i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] ; CHECK-GI-NEXT: ldr b1, [x0, #1] ; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ 
-158,8 +158,8 @@ define <2 x i16> @load_v2i16(ptr %ptr){ ; CHECK-GI-LABEL: load_v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -235,6 +235,7 @@ define <7 x i8> @load_v7i8(ptr %ptr){ ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr b0, [x0] ; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: mov v0.b[0], v0.b[0] ; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] ; CHECK-GI-NEXT: ldr b1, [x0, #2] ; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] @@ -261,10 +262,10 @@ define <3 x i16> @load_v3i16(ptr %ptr){ ; CHECK-GI-LABEL: load_v3i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: add x8, x0, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %a = load <3 x i16>, ptr %ptr @@ -280,18 +281,18 @@ define <7 x i16> @load_v7i16(ptr %ptr){ ; CHECK-GI-LABEL: load_v7i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #6] -; CHECK-GI-NEXT: mov v0.h[3], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #8] -; CHECK-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #10] -; CHECK-GI-NEXT: mov v0.h[5], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #12] -; CHECK-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: add x8, x0, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-GI-NEXT: add x8, x0, #6 +; CHECK-GI-NEXT: ld1 { v0.h }[3], [x8] +; CHECK-GI-NEXT: add x8, x0, #8 +; CHECK-GI-NEXT: ld1 { v0.h }[4], [x8] +; CHECK-GI-NEXT: add x8, x0, #10 +; CHECK-GI-NEXT: ld1 { v0.h }[5], [x8] +; CHECK-GI-NEXT: add x8, x0, #12 +; CHECK-GI-NEXT: ld1 { v0.h }[6], [x8] ; CHECK-GI-NEXT: ret %a = load <7 x i16>, ptr %ptr ret <7 x i16> %a @@ -305,10 +306,11 @@ define <3 x i32> @load_v3i32(ptr %ptr){ ; ; CHECK-GI-LABEL: load_v3i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldp s0, s1, [x0] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: ldr s1, [x0, #8] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: ldr s0, [x0] +; CHECK-GI-NEXT: add x8, x0, #4 +; CHECK-GI-NEXT: ld1 { v0.s }[1], [x8] +; CHECK-GI-NEXT: add x8, x0, #8 +; CHECK-GI-NEXT: ld1 { v0.s }[2], [x8] ; CHECK-GI-NEXT: ret %a = load <3 x i32>, ptr %ptr ret <3 x i32> %a diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 02258bc47c54d..9e748c9641aa8 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -83,13 +83,13 @@ define void @v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mov 
v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -124,22 +124,18 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -171,27 +167,27 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -259,13 +255,13 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; 
CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -293,18 +289,16 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] @@ -416,14 +410,14 @@ define <2 x i64> @v2i64(<2 x i64> %d, <2 x i64> %e) { ; ; CHECK-GI-LABEL: v2i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v1.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret entry: %s = mul <2 x i64> %d, %e @@ -461,16 +455,16 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d3 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v3.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v3.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 ; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: fmov x9, d5 ; CHECK-GI-NEXT: mul x8, x8, x9 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -515,10 +509,10 @@ define <4 x i64> @v4i64(<4 x i64> %d, <4 x i64> %e) { ; CHECK-GI-NEXT: fmov x9, d1 ; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x9, x9, x12 -; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: mov v0.d[0], x8 ; CHECK-GI-NEXT: mul x11, x13, x14 +; CHECK-GI-NEXT: mov v1.d[0], x9 ; CHECK-GI-NEXT: mov v0.d[1], x10 -; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: mov v1.d[1], x11 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll index 50c0c8b11e751..dbb4270fb8002 100644 --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -1120,12 +1120,10 @@ define <4 x i16> 
@vselect_constant_cond_zero_v4i16(<4 x i16> %a) { ; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: mov w9, #0 // =0x0 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v3.16b, v1.16b -; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] -; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v3.b[3], v1.b[0] -; CHECK-GI-NEXT: ushll v1.8h, v3.8b, #0 +; CHECK-GI-NEXT: mov v1.b[1], w9 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: shl v1.4h, v1.4h, #15 ; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #15 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b @@ -1148,10 +1146,9 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) { ; CHECK-GI-NEXT: mov w9, #0 // =0x0 ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v3.16b, v1.16b -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v1.h[0] -; CHECK-GI-NEXT: ushll v1.4s, v3.4h, #0 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-GI-NEXT: mov v1.d[1], v2.d[0] ; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 @@ -1199,12 +1196,10 @@ define <4 x i16> @vselect_constant_cond_v4i16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: mov w9, #0 // =0x0 ; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v4.16b, v2.16b -; CHECK-GI-NEXT: mov v4.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v4.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v4.b[3], v2.b[0] -; CHECK-GI-NEXT: ushll v2.8h, v4.8b, #0 +; CHECK-GI-NEXT: mov v2.b[1], w9 +; CHECK-GI-NEXT: mov v2.b[2], w9 +; CHECK-GI-NEXT: mov v2.b[3], w8 +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: shl v2.4h, v2.4h, #15 ; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #15 ; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b @@ -1227,10 +1222,9 @@ define <4 x i32> @vselect_constant_cond_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-GI-NEXT: mov w9, #0 // =0x0 ; CHECK-GI-NEXT: fmov s2, w8 ; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v4.16b, v2.16b -; CHECK-GI-NEXT: mov v4.h[1], v3.h[0] -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ushll v2.4s, v4.4h, #0 +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: mov v2.d[1], v3.d[0] ; CHECK-GI-NEXT: shl v2.4s, v2.4s, #31 diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll index 59958afdd0d1e..adc89f7a0d99d 100644 --- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll @@ -2674,10 +2674,10 @@ define <4 x i32> @fcmal4xfloat(<4 x float> %A, <4 x float> %B) { ; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: mov v1.16b, v0.16b -; CHECK-GI-NEXT: mov v1.h[1], v0.h[0] -; CHECK-GI-NEXT: mov v0.h[1], v0.h[0] -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w8 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] ; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31 ; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31 @@ -2725,10 +2725,10 @@ define <4 x i32> @fcmnv4xfloat(<4 x float> %A, <4 x float> %B) { ; CHECK-GI-NEXT: mov w8, #0 // =0x0 ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: mov v1.16b, v0.16b -; CHECK-GI-NEXT: mov v1.h[1], 
v0.h[0] -; CHECK-GI-NEXT: mov v0.h[1], v0.h[0] -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w8 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] ; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31 ; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31 diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 6f4b090fb22bd..f5e566f49b91e 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -1266,133 +1266,99 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; ; CHECK-GI-LABEL: v20: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr w9, [sp, #64] -; CHECK-GI-NEXT: ldr w10, [sp, #72] -; CHECK-GI-NEXT: and w13, w2, #0xff -; CHECK-GI-NEXT: ldr w11, [sp, #80] -; CHECK-GI-NEXT: ldr w12, [sp, #88] -; CHECK-GI-NEXT: fmov s19, w13 -; CHECK-GI-NEXT: fmov s0, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #224] -; CHECK-GI-NEXT: fmov s16, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #232] -; CHECK-GI-NEXT: fmov s3, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #240] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #248] -; CHECK-GI-NEXT: fmov s1, w12 -; CHECK-GI-NEXT: fmov s7, w10 -; CHECK-GI-NEXT: and w10, w1, #0xff -; CHECK-GI-NEXT: fmov s5, w11 -; CHECK-GI-NEXT: fmov s4, w9 ; CHECK-GI-NEXT: and w9, w0, #0xff -; CHECK-GI-NEXT: ldrb w11, [sp] -; CHECK-GI-NEXT: ldrb w12, [sp, #8] -; CHECK-GI-NEXT: fmov s6, w9 -; CHECK-GI-NEXT: fmov s20, w10 -; CHECK-GI-NEXT: ldrb w9, [sp, #96] -; CHECK-GI-NEXT: ldrb w10, [sp, #104] -; CHECK-GI-NEXT: fmov s17, w11 -; CHECK-GI-NEXT: fmov s21, w12 -; CHECK-GI-NEXT: ldrb w11, [sp, #160] -; CHECK-GI-NEXT: mov v0.b[1], v16.b[0] -; CHECK-GI-NEXT: fmov s18, w9 -; CHECK-GI-NEXT: fmov s22, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #96] +; CHECK-GI-NEXT: and w11, w1, #0xff +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: ldrb w9, [sp] +; CHECK-GI-NEXT: ldrb w12, [sp, #104] +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #160] +; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: ldrb w9, [sp, #168] -; CHECK-GI-NEXT: mov v6.h[1], v20.h[0] -; CHECK-GI-NEXT: fmov s20, w11 -; CHECK-GI-NEXT: ldrb w10, [sp, #16] -; CHECK-GI-NEXT: mov v17.h[1], v21.h[0] -; CHECK-GI-NEXT: fmov s21, w9 -; CHECK-GI-NEXT: ldrb w9, [sp, #112] -; CHECK-GI-NEXT: mov v18.h[1], v22.h[0] -; CHECK-GI-NEXT: fmov s23, w10 -; CHECK-GI-NEXT: ldrb w10, [sp, #176] -; CHECK-GI-NEXT: and w11, w3, #0xff -; CHECK-GI-NEXT: mov v2.b[1], v7.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v6.h[2], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w9 -; CHECK-GI-NEXT: mov v20.h[1], v21.h[0] -; CHECK-GI-NEXT: ldrb w9, [sp, #24] -; CHECK-GI-NEXT: fmov s22, w11 -; CHECK-GI-NEXT: mov v17.h[2], v23.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w11 +; CHECK-GI-NEXT: ldrb w11, [sp, #8] +; CHECK-GI-NEXT: fmov s3, w10 +; CHECK-GI-NEXT: mov v2.h[1], w12 +; CHECK-GI-NEXT: and w10, w2, #0xff +; CHECK-GI-NEXT: and w12, w5, #0xff +; CHECK-GI-NEXT: mov v1.h[1], w11 ; CHECK-GI-NEXT: and w11, w4, #0xff -; CHECK-GI-NEXT: mov v18.h[2], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w10 -; CHECK-GI-NEXT: ldrb w10, [sp, #120] -; CHECK-GI-NEXT: fmov s23, w9 +; CHECK-GI-NEXT: mov v3.h[1], w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #112] +; CHECK-GI-NEXT: mov v0.h[2], w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #16] +; CHECK-GI-NEXT: mov v2.h[2], w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #176] +; CHECK-GI-NEXT: mov v1.h[2], w10 +; CHECK-GI-NEXT: and w10, w3, #0xff +; CHECK-GI-NEXT: mov v3.h[2], w9 
+; CHECK-GI-NEXT: ldrb w9, [sp, #120] +; CHECK-GI-NEXT: mov v0.h[3], w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #24] +; CHECK-GI-NEXT: mov v2.h[3], w9 ; CHECK-GI-NEXT: ldrb w9, [sp, #184] -; CHECK-GI-NEXT: mov v6.h[3], v22.h[0] -; CHECK-GI-NEXT: fmov s21, w11 -; CHECK-GI-NEXT: and w11, w6, #0xff -; CHECK-GI-NEXT: mov v2.b[2], v5.b[0] -; CHECK-GI-NEXT: mov v20.h[2], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w10 -; CHECK-GI-NEXT: fmov s16, w9 +; CHECK-GI-NEXT: mov v1.h[3], w10 +; CHECK-GI-NEXT: ldr w10, [sp, #64] +; CHECK-GI-NEXT: mov v3.h[3], w9 ; CHECK-GI-NEXT: ldrb w9, [sp, #128] -; CHECK-GI-NEXT: and w10, w5, #0xff -; CHECK-GI-NEXT: mov v17.h[3], v23.h[0] -; CHECK-GI-NEXT: mov v6.h[4], v21.h[0] -; CHECK-GI-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-NEXT: mov v18.h[3], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w9 -; CHECK-GI-NEXT: ldrb w9, [sp, #192] -; CHECK-GI-NEXT: mov v20.h[3], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w10 -; CHECK-GI-NEXT: ldrb w10, [sp, #32] -; CHECK-GI-NEXT: mov v2.b[3], v4.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mov v18.h[4], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w10 +; CHECK-GI-NEXT: mov v0.h[4], w11 +; CHECK-GI-NEXT: ldrb w11, [sp, #32] +; CHECK-GI-NEXT: fmov s4, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #192] +; CHECK-GI-NEXT: mov v2.h[4], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #72] +; CHECK-GI-NEXT: mov v1.h[4], w11 +; CHECK-GI-NEXT: ldr w11, [sp, #224] +; CHECK-GI-NEXT: mov v3.h[4], w10 ; CHECK-GI-NEXT: ldrb w10, [sp, #136] -; CHECK-GI-NEXT: mov v6.h[5], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w10 -; CHECK-GI-NEXT: ldrb w10, [sp, #48] -; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-GI-NEXT: mov v17.h[4], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w9 -; CHECK-GI-NEXT: ldrb w9, [sp, #40] -; CHECK-GI-NEXT: mov v18.h[5], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w9 +; CHECK-GI-NEXT: mov v4.b[1], w9 +; CHECK-GI-NEXT: fmov s5, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #232] +; CHECK-GI-NEXT: mov v0.h[5], w12 +; CHECK-GI-NEXT: ldrb w12, [sp, #40] +; CHECK-GI-NEXT: mov v2.h[5], w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #200] ; CHECK-GI-NEXT: ldrb w9, [sp, #144] -; CHECK-GI-NEXT: mov v20.h[4], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w11 -; CHECK-GI-NEXT: ldrb w11, [sp, #200] -; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h -; CHECK-GI-NEXT: fmov s7, w11 -; CHECK-GI-NEXT: mov v17.h[5], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w9 -; CHECK-GI-NEXT: ldrb w11, [sp, #208] -; CHECK-GI-NEXT: mov v6.h[6], v19.h[0] -; CHECK-GI-NEXT: ldrb w9, [sp, #56] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov v20.h[5], v7.h[0] -; CHECK-GI-NEXT: fmov s7, w10 -; CHECK-GI-NEXT: mov v18.h[6], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w11 +; CHECK-GI-NEXT: mov v5.b[1], w11 +; CHECK-GI-NEXT: mov v1.h[5], w12 +; CHECK-GI-NEXT: mov v3.h[5], w10 +; CHECK-GI-NEXT: ldr w10, [sp, #80] +; CHECK-GI-NEXT: ldr w12, [sp, #240] +; CHECK-GI-NEXT: and w11, w6, #0xff +; CHECK-GI-NEXT: mov v0.h[6], w11 +; CHECK-GI-NEXT: ldrb w11, [sp, #48] +; CHECK-GI-NEXT: mov v2.h[6], w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #208] +; CHECK-GI-NEXT: mov v4.b[2], w10 ; CHECK-GI-NEXT: ldrb w10, [sp, #152] -; CHECK-GI-NEXT: and w11, w7, #0xff -; CHECK-GI-NEXT: fmov s3, w11 -; CHECK-GI-NEXT: str q0, [x8, #64] -; CHECK-GI-NEXT: fmov s5, w10 +; CHECK-GI-NEXT: mov v5.b[2], w12 +; CHECK-GI-NEXT: mov v1.h[6], w11 +; CHECK-GI-NEXT: ldr w11, [sp, #248] +; CHECK-GI-NEXT: mov v3.h[6], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #88] +; CHECK-GI-NEXT: and w12, w7, #0xff +; CHECK-GI-NEXT: mov v0.h[7], w12 +; CHECK-GI-NEXT: mov v2.h[7], w10 +; CHECK-GI-NEXT: ldrb w12, [sp, #56] +; CHECK-GI-NEXT: 
mov v4.b[3], w9 ; CHECK-GI-NEXT: ldrb w10, [sp, #216] -; CHECK-GI-NEXT: mov v17.h[6], v7.h[0] -; CHECK-GI-NEXT: mov v20.h[6], v16.h[0] -; CHECK-GI-NEXT: fmov s7, w9 -; CHECK-GI-NEXT: mov v6.h[7], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w10 -; CHECK-GI-NEXT: mov v18.h[7], v5.h[0] -; CHECK-GI-NEXT: mov v17.h[7], v7.h[0] -; CHECK-GI-NEXT: mov v20.h[7], v3.h[0] -; CHECK-GI-NEXT: add v1.8h, v6.8h, v18.8h -; CHECK-GI-NEXT: add v3.8h, v17.8h, v20.8h +; CHECK-GI-NEXT: mov v5.b[3], w11 +; CHECK-GI-NEXT: mov v1.h[7], w12 +; CHECK-GI-NEXT: mov v3.h[7], w10 +; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: ushll v2.8h, v4.8b, #0 +; CHECK-GI-NEXT: ushll v4.8h, v5.8b, #0 +; CHECK-GI-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: add v2.4h, v2.4h, v4.4h ; CHECK-GI-NEXT: ushll v4.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: stp q4, q1, [x8] -; CHECK-GI-NEXT: stp q2, q3, [x8, #32] +; CHECK-GI-NEXT: stp q3, q0, [x8] +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: stp q4, q1, [x8, #32] +; CHECK-GI-NEXT: str q2, [x8, #64] ; CHECK-GI-NEXT: ret entry: %s0s = zext <20 x i8> %s0 to <20 x i32> @@ -1497,107 +1463,83 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) { ; ; CHECK-GI-LABEL: i12: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr w12, [sp] +; CHECK-GI-NEXT: ldr w14, [sp, #32] ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: ldr w9, [sp, #8] -; CHECK-GI-NEXT: ldr w11, [sp, #32] -; CHECK-GI-NEXT: ldr w12, [sp, #40] -; CHECK-GI-NEXT: fmov s5, w7 -; CHECK-GI-NEXT: ldr w10, [sp, #16] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ldr w16, [sp, #128] +; CHECK-GI-NEXT: ldr w17, [sp, #160] ; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: fmov s2, w12 +; CHECK-GI-NEXT: fmov s3, w14 +; CHECK-GI-NEXT: ldr w12, [sp, #64] +; CHECK-GI-NEXT: ldr w14, [sp, #96] +; CHECK-GI-NEXT: ldr w13, [sp, #8] +; CHECK-GI-NEXT: ldr w15, [sp, #40] ; CHECK-GI-NEXT: fmov s4, w12 -; CHECK-GI-NEXT: ldr w12, [sp, #96] -; CHECK-GI-NEXT: ldr w13, [sp, #104] -; CHECK-GI-NEXT: ldr w14, [sp, #128] -; CHECK-GI-NEXT: ldr w15, [sp, #136] -; CHECK-GI-NEXT: ldr w16, [sp, #160] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: fmov s7, w13 -; CHECK-GI-NEXT: fmov s16, w15 -; CHECK-GI-NEXT: ldr w17, [sp, #168] -; CHECK-GI-NEXT: ldr w9, [sp, #24] -; CHECK-GI-NEXT: ldr w13, [sp, #176] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w6 -; CHECK-GI-NEXT: fmov s17, w17 -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #56] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w11 +; CHECK-GI-NEXT: fmov s6, w16 +; CHECK-GI-NEXT: fmov s7, w17 +; CHECK-GI-NEXT: fmov s5, w14 +; CHECK-GI-NEXT: mov v2.h[1], w13 +; CHECK-GI-NEXT: mov v3.h[1], w15 +; CHECK-GI-NEXT: ldr w13, [sp, #72] +; CHECK-GI-NEXT: ldr w15, [sp, #104] +; CHECK-GI-NEXT: ldr w12, [sp, #136] +; CHECK-GI-NEXT: ldr w18, [sp, #168] +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v1.h[1], w5 +; CHECK-GI-NEXT: mov v4.h[1], w13 +; CHECK-GI-NEXT: mov v5.h[1], w15 +; CHECK-GI-NEXT: mov v6.h[1], w12 +; CHECK-GI-NEXT: mov v7.h[1], w18 +; CHECK-GI-NEXT: ldr w10, [sp, #16] ; CHECK-GI-NEXT: ldr w11, [sp, #48] -; CHECK-GI-NEXT: mov v1.h[3], 
v5.h[0] -; CHECK-GI-NEXT: fmov s5, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #64] -; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w3 -; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] -; CHECK-GI-NEXT: fmov s5, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #72] -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: fmov s6, w11 -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: mov v3.h[2], v5.h[0] -; CHECK-GI-NEXT: fmov s5, w10 -; CHECK-GI-NEXT: ldr w9, [sp, #80] -; CHECK-GI-NEXT: ldr w10, [sp, #112] -; CHECK-GI-NEXT: ldr w11, [sp, #144] -; CHECK-GI-NEXT: mov v2.h[3], v4.h[0] -; CHECK-GI-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NEXT: fmov s6, w12 -; CHECK-GI-NEXT: fmov s18, w11 -; CHECK-GI-NEXT: ldr w12, [sp, #88] +; CHECK-GI-NEXT: ldr w12, [sp, #80] +; CHECK-GI-NEXT: ldr w13, [sp, #112] +; CHECK-GI-NEXT: ldr w14, [sp, #144] +; CHECK-GI-NEXT: ldr w15, [sp, #176] +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w6 +; CHECK-GI-NEXT: mov v2.h[2], w10 +; CHECK-GI-NEXT: mov v3.h[2], w11 +; CHECK-GI-NEXT: mov v4.h[2], w12 +; CHECK-GI-NEXT: mov v5.h[2], w13 +; CHECK-GI-NEXT: mov v6.h[2], w14 +; CHECK-GI-NEXT: mov v7.h[2], w15 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: ldr w9, [sp, #56] +; CHECK-GI-NEXT: ldr w10, [sp, #88] +; CHECK-GI-NEXT: ldr w11, [sp, #120] +; CHECK-GI-NEXT: ldr w12, [sp, #152] +; CHECK-GI-NEXT: ldr w13, [sp, #184] +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w7 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 +; CHECK-GI-NEXT: mov v4.h[3], w10 +; CHECK-GI-NEXT: mov v5.h[3], w11 +; CHECK-GI-NEXT: mov v6.h[3], w12 +; CHECK-GI-NEXT: mov v7.h[3], w13 +; CHECK-GI-NEXT: movi v16.4s, #15, msl #8 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NEXT: fmov s7, w14 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: mov v7.h[1], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w16 -; CHECK-GI-NEXT: mov v16.h[1], v17.h[0] -; CHECK-GI-NEXT: fmov s17, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #152] -; CHECK-GI-NEXT: mov v7.h[2], v18.h[0] -; CHECK-GI-NEXT: fmov s18, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #120] -; CHECK-GI-NEXT: mov v5.h[2], v17.h[0] -; CHECK-GI-NEXT: fmov s17, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #184] -; CHECK-GI-NEXT: mov v3.h[3], v18.h[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: fmov s18, w10 -; CHECK-GI-NEXT: mov v6.h[2], v17.h[0] -; CHECK-GI-NEXT: fmov s17, w13 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: mov v16.h[2], v17.h[0] -; CHECK-GI-NEXT: fmov s17, w12 -; CHECK-GI-NEXT: mov v6.h[3], v4.h[0] -; CHECK-GI-NEXT: movi v4.4s, #15, msl #8 -; CHECK-GI-NEXT: mov v5.h[3], v17.h[0] -; CHECK-GI-NEXT: fmov s17, w9 -; CHECK-GI-NEXT: mov v16.h[3], v18.h[0] -; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b -; CHECK-GI-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-GI-NEXT: mov v7.h[3], v17.h[0] -; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b -; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b +; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 ; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0 -; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 -; CHECK-GI-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 ; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0 -; CHECK-GI-NEXT: and v5.16b, v5.16b, v4.16b -; CHECK-GI-NEXT: add v1.4s, v1.4s, v6.4s -; CHECK-GI-NEXT: and v7.16b, v7.16b, v4.16b -; CHECK-GI-NEXT: and v4.16b, v16.16b, v4.16b -; CHECK-GI-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-GI-NEXT: add v2.4s, 
v2.4s, v7.4s -; CHECK-GI-NEXT: add v3.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: and v0.16b, v0.16b, v16.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v16.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v16.16b +; CHECK-GI-NEXT: and v3.16b, v3.16b, v16.16b +; CHECK-GI-NEXT: and v4.16b, v4.16b, v16.16b +; CHECK-GI-NEXT: and v5.16b, v5.16b, v16.16b +; CHECK-GI-NEXT: and v6.16b, v6.16b, v16.16b +; CHECK-GI-NEXT: and v7.16b, v7.16b, v16.16b +; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-GI-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-GI-NEXT: add v3.4s, v3.4s, v7.4s ; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i12> %s0 to <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/neon-extmul.ll b/llvm/test/CodeGen/AArch64/neon-extmul.ll index 3dbc033dfab96..f83ac8ed642cc 100644 --- a/llvm/test/CodeGen/AArch64/neon-extmul.ll +++ b/llvm/test/CodeGen/AArch64/neon-extmul.ll @@ -272,18 +272,18 @@ define <8 x i64> @extaddsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-GI-NEXT: mul x15, x15, x16 ; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: fmov x11, d0 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v1.d[0], x9 ; CHECK-GI-NEXT: mul x13, x13, x18 -; CHECK-GI-NEXT: mov v0.d[1], x12 ; CHECK-GI-NEXT: mul x11, x11, x14 ; CHECK-GI-NEXT: mov x14, v6.d[1] +; CHECK-GI-NEXT: mov v0.d[1], x12 +; CHECK-GI-NEXT: mov v2.d[0], x10 ; CHECK-GI-NEXT: mov v1.d[1], x15 -; CHECK-GI-NEXT: fmov d2, x10 ; CHECK-GI-NEXT: mul x14, x14, x17 -; CHECK-GI-NEXT: fmov d3, x11 -; CHECK-GI-NEXT: mov v3.d[1], x13 +; CHECK-GI-NEXT: mov v3.d[0], x11 ; CHECK-GI-NEXT: mov v2.d[1], x14 +; CHECK-GI-NEXT: mov v3.d[1], x13 ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i64> @@ -423,22 +423,22 @@ define <8 x i64> @extmuladdsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b) ; CHECK-GI-NEXT: mul x15, x15, x16 ; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: fmov x11, d0 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v1.d[0], x9 ; CHECK-GI-NEXT: mul x13, x13, x18 -; CHECK-GI-NEXT: mov v0.d[1], x12 ; CHECK-GI-NEXT: mul x11, x11, x14 ; CHECK-GI-NEXT: mov x14, v18.d[1] +; CHECK-GI-NEXT: mov v0.d[1], x12 +; CHECK-GI-NEXT: mov v6.d[0], x10 ; CHECK-GI-NEXT: mov v1.d[1], x15 -; CHECK-GI-NEXT: fmov d6, x10 -; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d ; CHECK-GI-NEXT: mul x14, x14, x17 +; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov v7.d[0], x11 ; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d -; CHECK-GI-NEXT: fmov d7, x11 -; CHECK-GI-NEXT: mov v7.d[1], x13 ; CHECK-GI-NEXT: mov v6.d[1], x14 -; CHECK-GI-NEXT: add v3.2d, v7.2d, v5.2d +; CHECK-GI-NEXT: mov v7.d[1], x13 ; CHECK-GI-NEXT: add v2.2d, v6.2d, v4.2d +; CHECK-GI-NEXT: add v3.2d, v7.2d, v5.2d ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i64> diff --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll index 15763543113eb..2897741780f60 100644 --- a/llvm/test/CodeGen/AArch64/neon-perm.ll +++ b/llvm/test/CodeGen/AArch64/neon-perm.ll @@ -1741,12 +1741,13 @@ define <4 x i8> @test_vzip1_v4i8(<8 x i8> %p) { ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; 
CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %lo = shufflevector <8 x i8> %p, <8 x i8> undef, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll index af283f6a093e9..3263a5e03c1fd 100644 --- a/llvm/test/CodeGen/AArch64/ptradd.ll +++ b/llvm/test/CodeGen/AArch64/ptradd.ll @@ -77,17 +77,18 @@ define void @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off, ptr %p) { ; ; CHECK-GI-LABEL: vector_gep_v3i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: smov x8, v3.s[0] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: smov x9, v3.s[1] -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: mov w8, v3.s[2] -; CHECK-GI-NEXT: mov v1.d[1], x9 +; CHECK-GI-NEXT: smov x9, v3.s[0] +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: smov x10, v3.s[1] +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v4.d[0], x9 ; CHECK-GI-NEXT: fmov x9, d2 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov w8, v3.s[2] +; CHECK-GI-NEXT: mov v4.d[1], x10 ; CHECK-GI-NEXT: add x8, x9, w8, sxtw -; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: add v0.2d, v0.2d, v4.2d ; CHECK-GI-NEXT: str x8, [x0, #16] ; CHECK-GI-NEXT: str q0, [x0] ; CHECK-GI-NEXT: ret @@ -166,17 +167,18 @@ define void @vector_gep_v3i64(<3 x ptr> %b, <3 x i64> %off, ptr %p) { ; ; CHECK-GI-LABEL: vector_gep_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d0 ; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 -; CHECK-GI-NEXT: fmov x8, d2 ; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d2 ; CHECK-GI-NEXT: add x8, x8, x9 -; CHECK-GI-NEXT: str x8, [x0, #16] ; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d +; CHECK-GI-NEXT: str x8, [x0, #16] ; CHECK-GI-NEXT: str q0, [x0] ; CHECK-GI-NEXT: ret entry: @@ -206,13 +208,21 @@ entry: } define void @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off, ptr %p) { -; CHECK-LABEL: vector_gep_v4i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d1, x0 -; CHECK-NEXT: mov v1.d[1], x2 -; CHECK-NEXT: add v0.2d, v0.2d, v1.2d -; CHECK-NEXT: str q0, [x4] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vector_gep_v4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d1, x0 +; CHECK-SD-NEXT: mov v1.d[1], x2 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: str q0, [x4] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.d[0], x0 +; CHECK-GI-NEXT: mov v1.d[1], x2 +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: str q0, [x4] +; CHECK-GI-NEXT: ret entry: %g = getelementptr i8, <2 x ptr> %b, <2 x i128> %off store <2 x ptr> %g, ptr %p diff --git a/llvm/test/CodeGen/AArch64/rem.ll b/llvm/test/CodeGen/AArch64/rem.ll index 81682c5f0ce85..d807635f5d87d 100644 --- a/llvm/test/CodeGen/AArch64/rem.ll +++ b/llvm/test/CodeGen/AArch64/rem.ll @@ -190,7 +190,7 @@ define 
<2 x i8> @sv2i8(<2 x i8> %d, <2 x i8> %e) { ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -284,7 +284,7 @@ define <4 x i8> @sv4i8(<4 x i8> %d, <4 x i8> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -363,12 +363,12 @@ define <8 x i8> @sv8i8(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-NEXT: fmov w13, s1 ; CHECK-GI-NEXT: mov w14, v1.s[1] ; CHECK-GI-NEXT: mov w15, v1.s[2] -; CHECK-GI-NEXT: mov w16, v1.s[3] ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v2.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v2.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v0.s[3] ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v2.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -381,11 +381,11 @@ define <8 x i8> @sv8i8(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s ; CHECK-GI-NEXT: sdiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v0.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v1.s[3] ; CHECK-GI-NEXT: sdiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v0.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: sdiv w8, w15, w16 +; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s @@ -527,20 +527,20 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: mov w18, v1.s[1] ; CHECK-GI-NEXT: mov w0, v1.s[2] ; CHECK-GI-NEXT: mov w1, v1.s[3] -; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: sdiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[1] ; CHECK-GI-NEXT: mov w9, v3.s[1] ; CHECK-GI-NEXT: fmov w2, s7 ; CHECK-GI-NEXT: mov w3, v7.s[1] ; CHECK-GI-NEXT: mov w4, v7.s[2] -; CHECK-GI-NEXT: mov w5, v7.s[3] -; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: sdiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[2] ; CHECK-GI-NEXT: mov w9, v3.s[2] -; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: mov v16.s[0], w11 +; CHECK-GI-NEXT: mov w11, v6.s[3] ; CHECK-GI-NEXT: sdiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[3] -; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: mov v16.s[1], w10 ; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 ; CHECK-GI-NEXT: mov v16.s[2], w9 @@ -552,7 +552,8 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: sdiv w15, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[2] ; CHECK-GI-NEXT: mov w13, v5.s[2] -; CHECK-GI-NEXT: fmov s17, w14 +; CHECK-GI-NEXT: mov v17.s[0], w14 +; CHECK-GI-NEXT: mov w14, v7.s[3] ; CHECK-GI-NEXT: sdiv w13, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: mov v17.s[1], w15 @@ -565,7 +566,7 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s ; CHECK-GI-NEXT: sdiv w17, w17, w18 ; CHECK-GI-NEXT: mov w18, v0.s[2] -; CHECK-GI-NEXT: fmov s18, w16 +; CHECK-GI-NEXT: mov v18.s[0], w16 ; CHECK-GI-NEXT: sdiv w18, w18, w0 ; CHECK-GI-NEXT: mov w0, v0.s[3] ; CHECK-GI-NEXT: mov v18.s[1], w17 @@ -579,11 +580,10 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: uzp1 v1.8h, 
v2.8h, v4.8h ; CHECK-GI-NEXT: sdiv w2, w2, w3 ; CHECK-GI-NEXT: mov w3, v6.s[2] -; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: mov v19.s[0], w1 ; CHECK-GI-NEXT: sdiv w3, w3, w4 -; CHECK-GI-NEXT: mov w4, v6.s[3] ; CHECK-GI-NEXT: mov v19.s[1], w2 -; CHECK-GI-NEXT: sdiv w10, w4, w5 +; CHECK-GI-NEXT: sdiv w10, w11, w14 ; CHECK-GI-NEXT: mov v19.s[2], w3 ; CHECK-GI-NEXT: mov v19.s[3], w10 ; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s @@ -866,14 +866,13 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { ; ; CHECK-GI-LABEL: sv32i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sub sp, sp, #112 -; CHECK-GI-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x26, x25, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x24, x23, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w21, -24 @@ -902,43 +901,41 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: fmov w9, s7 ; CHECK-GI-NEXT: mov w12, v7.s[3] ; CHECK-GI-NEXT: fmov w13, s5 -; CHECK-GI-NEXT: mov w14, v5.s[1] ; CHECK-GI-NEXT: mov w16, v5.s[3] ; CHECK-GI-NEXT: fmov w6, s19 ; CHECK-GI-NEXT: mov w7, v19.s[3] ; CHECK-GI-NEXT: fmov w21, s17 -; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: mov w23, v17.s[3] +; CHECK-GI-NEXT: sdiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[1] ; CHECK-GI-NEXT: mov w9, v7.s[1] -; CHECK-GI-NEXT: mov w22, v17.s[3] -; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: sdiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[2] ; CHECK-GI-NEXT: mov w9, v7.s[2] -; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: mov v20.s[0], w11 ; CHECK-GI-NEXT: sdiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[3] ; CHECK-GI-NEXT: sshll2 v6.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov v20.s[1], w11 +; CHECK-GI-NEXT: mov v20.s[1], w10 ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: sshll v28.4s, v0.4h, #0 ; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 ; CHECK-GI-NEXT: mov v20.s[2], w9 -; CHECK-GI-NEXT: sdiv w13, w12, w13 +; CHECK-GI-NEXT: sdiv w15, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[1] -; CHECK-GI-NEXT: str w8, [sp, #12] // 4-byte Folded Spill -; CHECK-GI-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload -; CHECK-GI-NEXT: mov v20.s[3], w11 -; CHECK-GI-NEXT: sdiv w15, w12, w14 +; CHECK-GI-NEXT: mov w13, v5.s[1] +; CHECK-GI-NEXT: mov v20.s[3], w8 +; CHECK-GI-NEXT: sdiv w14, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[2] -; CHECK-GI-NEXT: mov w14, v5.s[2] +; CHECK-GI-NEXT: mov w13, v5.s[2] ; CHECK-GI-NEXT: sshll v5.4s, v6.4h, #0 -; CHECK-GI-NEXT: fmov s21, w13 -; CHECK-GI-NEXT: sdiv w14, w12, w14 +; CHECK-GI-NEXT: mov v21.s[0], w15 +; CHECK-GI-NEXT: sdiv w13, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: sshll2 v4.8h, v2.16b, #0 -; CHECK-GI-NEXT: mov v21.s[1], w15 +; 
CHECK-GI-NEXT: mov v21.s[1], w14 ; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: sshll v7.4s, v4.4h, #0 ; CHECK-GI-NEXT: sshll v30.4s, v2.4h, #0 @@ -947,72 +944,72 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: mls v28.4s, v20.4s, v30.4s ; CHECK-GI-NEXT: sdiv w12, w12, w16 ; CHECK-GI-NEXT: fmov w16, s5 -; CHECK-GI-NEXT: mov v21.s[2], w14 -; CHECK-GI-NEXT: sdiv w18, w16, w17 +; CHECK-GI-NEXT: mov v21.s[2], w13 +; CHECK-GI-NEXT: sdiv w1, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[1] ; CHECK-GI-NEXT: mov w17, v7.s[1] ; CHECK-GI-NEXT: mov v21.s[3], w12 ; CHECK-GI-NEXT: mls v0.4s, v21.4s, v2.4s -; CHECK-GI-NEXT: sdiv w1, w16, w17 +; CHECK-GI-NEXT: sdiv w0, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[2] ; CHECK-GI-NEXT: mov w17, v7.s[2] -; CHECK-GI-NEXT: fmov s22, w18 +; CHECK-GI-NEXT: mov v22.s[0], w1 ; CHECK-GI-NEXT: uzp1 v0.8h, v28.8h, v0.8h -; CHECK-GI-NEXT: sdiv w0, w16, w17 +; CHECK-GI-NEXT: sdiv w18, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[3] ; CHECK-GI-NEXT: mov w17, v7.s[3] ; CHECK-GI-NEXT: sshll2 v5.4s, v6.8h, #0 ; CHECK-GI-NEXT: sshll2 v7.4s, v4.8h, #0 -; CHECK-GI-NEXT: mov v22.s[1], w1 +; CHECK-GI-NEXT: mov v22.s[1], w0 ; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 ; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 ; CHECK-GI-NEXT: fmov w2, s7 -; CHECK-GI-NEXT: mov w3, v7.s[3] +; CHECK-GI-NEXT: mov w4, v7.s[3] ; CHECK-GI-NEXT: sdiv w16, w16, w17 ; CHECK-GI-NEXT: fmov w17, s5 -; CHECK-GI-NEXT: mov v22.s[2], w0 +; CHECK-GI-NEXT: mov v22.s[2], w18 ; CHECK-GI-NEXT: sdiv w5, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[1] ; CHECK-GI-NEXT: mov w2, v7.s[1] ; CHECK-GI-NEXT: mov v22.s[3], w16 ; CHECK-GI-NEXT: mls v6.4s, v22.4s, v4.4s -; CHECK-GI-NEXT: sdiv w4, w17, w2 +; CHECK-GI-NEXT: sdiv w3, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[2] ; CHECK-GI-NEXT: mov w2, v7.s[2] -; CHECK-GI-NEXT: fmov s23, w5 +; CHECK-GI-NEXT: mov v23.s[0], w5 ; CHECK-GI-NEXT: sdiv w2, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[3] -; CHECK-GI-NEXT: mov v23.s[1], w4 -; CHECK-GI-NEXT: sdiv w17, w17, w3 -; CHECK-GI-NEXT: fmov w3, s18 +; CHECK-GI-NEXT: mov v23.s[1], w3 +; CHECK-GI-NEXT: sdiv w17, w17, w4 +; CHECK-GI-NEXT: fmov w4, s18 ; CHECK-GI-NEXT: mov v23.s[2], w2 -; CHECK-GI-NEXT: sdiv w20, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[1] +; CHECK-GI-NEXT: sdiv w20, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[1] ; CHECK-GI-NEXT: mov w6, v19.s[1] ; CHECK-GI-NEXT: mov v23.s[3], w17 ; CHECK-GI-NEXT: mls v5.4s, v23.4s, v7.4s -; CHECK-GI-NEXT: sdiv w19, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[2] +; CHECK-GI-NEXT: sdiv w19, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[2] ; CHECK-GI-NEXT: mov w6, v19.s[2] -; CHECK-GI-NEXT: fmov s24, w20 +; CHECK-GI-NEXT: mov v24.s[0], w20 ; CHECK-GI-NEXT: uzp1 v2.8h, v6.8h, v5.8h ; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-GI-NEXT: sdiv w6, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[3] +; CHECK-GI-NEXT: sdiv w6, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[3] ; CHECK-GI-NEXT: mov v24.s[1], w19 -; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: sdiv w3, w3, w7 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: sdiv w4, w4, w7 ; CHECK-GI-NEXT: fmov w7, s16 ; CHECK-GI-NEXT: mov v24.s[2], w6 -; CHECK-GI-NEXT: sdiv w23, w7, w21 +; CHECK-GI-NEXT: sdiv w24, w7, w21 ; CHECK-GI-NEXT: mov w7, v16.s[1] ; CHECK-GI-NEXT: mov w21, v17.s[1] -; CHECK-GI-NEXT: mov v24.s[3], w3 -; CHECK-GI-NEXT: sdiv w24, w7, w21 +; CHECK-GI-NEXT: mov v24.s[3], w4 +; CHECK-GI-NEXT: sdiv w22, w7, w21 ; CHECK-GI-NEXT: mov w7, v16.s[2] ; CHECK-GI-NEXT: mov w21, v17.s[2] ; 
CHECK-GI-NEXT: sshll2 v17.8h, v1.16b, #0 -; CHECK-GI-NEXT: fmov s25, w23 +; CHECK-GI-NEXT: mov v25.s[0], w24 ; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: sshll v18.4s, v17.4h, #0 ; CHECK-GI-NEXT: sshll v29.4s, v1.4h, #0 @@ -1020,9 +1017,8 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: sdiv w21, w7, w21 ; CHECK-GI-NEXT: mov w7, v16.s[3] ; CHECK-GI-NEXT: sshll2 v16.8h, v3.16b, #0 -; CHECK-GI-NEXT: mov v25.s[1], w24 +; CHECK-GI-NEXT: mov v25.s[1], w22 ; CHECK-GI-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-GI-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: sshll v19.4s, v16.4h, #0 ; CHECK-GI-NEXT: sshll v31.4s, v3.4h, #0 ; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 @@ -1032,51 +1028,51 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: mov w28, v19.s[3] ; CHECK-GI-NEXT: sshll2 v19.4s, v16.8h, #0 ; CHECK-GI-NEXT: sshll v16.4s, v16.4h, #0 -; CHECK-GI-NEXT: sdiv w7, w7, w22 -; CHECK-GI-NEXT: fmov w22, s18 +; CHECK-GI-NEXT: sdiv w7, w7, w23 +; CHECK-GI-NEXT: fmov w23, s18 ; CHECK-GI-NEXT: mov v25.s[2], w21 ; CHECK-GI-NEXT: mls v29.4s, v24.4s, v31.4s +; CHECK-GI-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: fmov w29, s19 ; CHECK-GI-NEXT: mov w30, v19.s[1] -; CHECK-GI-NEXT: mov w8, v19.s[2] -; CHECK-GI-NEXT: mov w10, v19.s[3] -; CHECK-GI-NEXT: sdiv w25, w22, w25 -; CHECK-GI-NEXT: mov w22, v18.s[1] +; CHECK-GI-NEXT: mov w15, v19.s[2] +; CHECK-GI-NEXT: sdiv w25, w23, w25 +; CHECK-GI-NEXT: mov w23, v18.s[1] ; CHECK-GI-NEXT: mov v25.s[3], w7 ; CHECK-GI-NEXT: mls v1.4s, v25.4s, v3.4s -; CHECK-GI-NEXT: sdiv w26, w22, w26 -; CHECK-GI-NEXT: mov w22, v18.s[2] -; CHECK-GI-NEXT: fmov s26, w25 +; CHECK-GI-NEXT: sdiv w26, w23, w26 +; CHECK-GI-NEXT: mov w23, v18.s[2] +; CHECK-GI-NEXT: mov v26.s[0], w25 ; CHECK-GI-NEXT: uzp1 v1.8h, v29.8h, v1.8h -; CHECK-GI-NEXT: sdiv w27, w22, w27 -; CHECK-GI-NEXT: mov w22, v18.s[3] +; CHECK-GI-NEXT: sdiv w27, w23, w27 +; CHECK-GI-NEXT: mov w23, v18.s[3] ; CHECK-GI-NEXT: sshll2 v18.4s, v17.8h, #0 ; CHECK-GI-NEXT: mov v26.s[1], w26 ; CHECK-GI-NEXT: sshll v17.4s, v17.4h, #0 -; CHECK-GI-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov w11, v18.s[2] ; CHECK-GI-NEXT: mov w9, v18.s[3] -; CHECK-GI-NEXT: sdiv w22, w22, w28 +; CHECK-GI-NEXT: sdiv w23, w23, w28 ; CHECK-GI-NEXT: fmov w28, s18 ; CHECK-GI-NEXT: mov v26.s[2], w27 ; CHECK-GI-NEXT: sdiv w28, w28, w29 ; CHECK-GI-NEXT: mov w29, v18.s[1] -; CHECK-GI-NEXT: mov v26.s[3], w22 -; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v26.s[3], w23 +; CHECK-GI-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mls v17.4s, v26.4s, v16.4s ; CHECK-GI-NEXT: sdiv w29, w29, w30 -; CHECK-GI-NEXT: mov w30, v18.s[2] -; CHECK-GI-NEXT: fmov s27, w28 -; CHECK-GI-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: sdiv w8, w30, w8 +; CHECK-GI-NEXT: mov v27.s[0], w28 +; CHECK-GI-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: sdiv w10, w11, w15 +; CHECK-GI-NEXT: mov w11, v19.s[3] ; CHECK-GI-NEXT: mov v27.s[1], w29 -; CHECK-GI-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: sdiv w9, w9, w10 -; CHECK-GI-NEXT: mov v27.s[2], w8 -; CHECK-GI-NEXT: mov v27.s[3], w9 +; CHECK-GI-NEXT: sdiv w8, w9, w11 +; CHECK-GI-NEXT: mov v27.s[2], w10 +; CHECK-GI-NEXT: mov v27.s[3], w8 ; CHECK-GI-NEXT: mls v18.4s, v27.4s, v19.4s ; CHECK-GI-NEXT: uzp1 v3.8h, 
v17.8h, v18.8h ; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v3.16b -; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; CHECK-GI-NEXT: ret entry: %s = srem <32 x i8> %d, %e @@ -1113,7 +1109,7 @@ define <2 x i8> @uv2i8(<2 x i8> %d, <2 x i8> %e) { ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -1206,7 +1202,7 @@ define <4 x i8> @uv4i8(<4 x i8> %d, <4 x i8> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -1285,12 +1281,12 @@ define <8 x i8> @uv8i8(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-NEXT: fmov w13, s1 ; CHECK-GI-NEXT: mov w14, v1.s[1] ; CHECK-GI-NEXT: mov w15, v1.s[2] -; CHECK-GI-NEXT: mov w16, v1.s[3] ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v2.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v2.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v0.s[3] ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v2.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -1303,11 +1299,11 @@ define <8 x i8> @uv8i8(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s ; CHECK-GI-NEXT: udiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v0.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v1.s[3] ; CHECK-GI-NEXT: udiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v0.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: udiv w8, w15, w16 +; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s @@ -1449,20 +1445,20 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: mov w18, v1.s[1] ; CHECK-GI-NEXT: mov w0, v1.s[2] ; CHECK-GI-NEXT: mov w1, v1.s[3] -; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: udiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[1] ; CHECK-GI-NEXT: mov w9, v3.s[1] ; CHECK-GI-NEXT: fmov w2, s7 ; CHECK-GI-NEXT: mov w3, v7.s[1] ; CHECK-GI-NEXT: mov w4, v7.s[2] -; CHECK-GI-NEXT: mov w5, v7.s[3] -; CHECK-GI-NEXT: udiv w11, w8, w9 +; CHECK-GI-NEXT: udiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[2] ; CHECK-GI-NEXT: mov w9, v3.s[2] -; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: mov v16.s[0], w11 +; CHECK-GI-NEXT: mov w11, v6.s[3] ; CHECK-GI-NEXT: udiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[3] -; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: mov v16.s[1], w10 ; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 ; CHECK-GI-NEXT: mov v16.s[2], w9 @@ -1474,7 +1470,8 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: udiv w15, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[2] ; CHECK-GI-NEXT: mov w13, v5.s[2] -; CHECK-GI-NEXT: fmov s17, w14 +; CHECK-GI-NEXT: mov v17.s[0], w14 +; CHECK-GI-NEXT: mov w14, v7.s[3] ; CHECK-GI-NEXT: udiv w13, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: mov v17.s[1], w15 @@ -1487,7 +1484,7 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s ; CHECK-GI-NEXT: udiv w17, w17, w18 ; CHECK-GI-NEXT: mov w18, v0.s[2] -; CHECK-GI-NEXT: 
fmov s18, w16 +; CHECK-GI-NEXT: mov v18.s[0], w16 ; CHECK-GI-NEXT: udiv w18, w18, w0 ; CHECK-GI-NEXT: mov w0, v0.s[3] ; CHECK-GI-NEXT: mov v18.s[1], w17 @@ -1501,11 +1498,10 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v4.8h ; CHECK-GI-NEXT: udiv w2, w2, w3 ; CHECK-GI-NEXT: mov w3, v6.s[2] -; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: mov v19.s[0], w1 ; CHECK-GI-NEXT: udiv w3, w3, w4 -; CHECK-GI-NEXT: mov w4, v6.s[3] ; CHECK-GI-NEXT: mov v19.s[1], w2 -; CHECK-GI-NEXT: udiv w10, w4, w5 +; CHECK-GI-NEXT: udiv w10, w11, w14 ; CHECK-GI-NEXT: mov v19.s[2], w3 ; CHECK-GI-NEXT: mov v19.s[3], w10 ; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s @@ -1788,14 +1784,13 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; ; CHECK-GI-LABEL: uv32i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sub sp, sp, #112 -; CHECK-GI-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x26, x25, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x24, x23, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w21, -24 @@ -1824,43 +1819,41 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: fmov w9, s7 ; CHECK-GI-NEXT: mov w12, v7.s[3] ; CHECK-GI-NEXT: fmov w13, s5 -; CHECK-GI-NEXT: mov w14, v5.s[1] ; CHECK-GI-NEXT: mov w16, v5.s[3] ; CHECK-GI-NEXT: fmov w6, s19 ; CHECK-GI-NEXT: mov w7, v19.s[3] ; CHECK-GI-NEXT: fmov w21, s17 -; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: mov w23, v17.s[3] +; CHECK-GI-NEXT: udiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[1] ; CHECK-GI-NEXT: mov w9, v7.s[1] -; CHECK-GI-NEXT: mov w22, v17.s[3] -; CHECK-GI-NEXT: udiv w11, w8, w9 +; CHECK-GI-NEXT: udiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[2] ; CHECK-GI-NEXT: mov w9, v7.s[2] -; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: mov v20.s[0], w11 ; CHECK-GI-NEXT: udiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[3] ; CHECK-GI-NEXT: ushll2 v6.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov v20.s[1], w11 +; CHECK-GI-NEXT: mov v20.s[1], w10 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: ushll v28.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 ; CHECK-GI-NEXT: mov v20.s[2], w9 -; CHECK-GI-NEXT: udiv w13, w12, w13 +; CHECK-GI-NEXT: udiv w15, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[1] -; CHECK-GI-NEXT: str w8, [sp, #12] // 4-byte Folded Spill -; CHECK-GI-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload -; CHECK-GI-NEXT: mov v20.s[3], w11 -; CHECK-GI-NEXT: udiv w15, w12, w14 +; CHECK-GI-NEXT: mov w13, v5.s[1] +; CHECK-GI-NEXT: mov v20.s[3], w8 +; CHECK-GI-NEXT: udiv w14, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[2] -; CHECK-GI-NEXT: mov w14, v5.s[2] +; CHECK-GI-NEXT: mov w13, v5.s[2] ; CHECK-GI-NEXT: ushll v5.4s, v6.4h, #0 
-; CHECK-GI-NEXT: fmov s21, w13 -; CHECK-GI-NEXT: udiv w14, w12, w14 +; CHECK-GI-NEXT: mov v21.s[0], w15 +; CHECK-GI-NEXT: udiv w13, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: ushll2 v4.8h, v2.16b, #0 -; CHECK-GI-NEXT: mov v21.s[1], w15 +; CHECK-GI-NEXT: mov v21.s[1], w14 ; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: ushll v7.4s, v4.4h, #0 ; CHECK-GI-NEXT: ushll v30.4s, v2.4h, #0 @@ -1869,72 +1862,72 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: mls v28.4s, v20.4s, v30.4s ; CHECK-GI-NEXT: udiv w12, w12, w16 ; CHECK-GI-NEXT: fmov w16, s5 -; CHECK-GI-NEXT: mov v21.s[2], w14 -; CHECK-GI-NEXT: udiv w18, w16, w17 +; CHECK-GI-NEXT: mov v21.s[2], w13 +; CHECK-GI-NEXT: udiv w1, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[1] ; CHECK-GI-NEXT: mov w17, v7.s[1] ; CHECK-GI-NEXT: mov v21.s[3], w12 ; CHECK-GI-NEXT: mls v0.4s, v21.4s, v2.4s -; CHECK-GI-NEXT: udiv w1, w16, w17 +; CHECK-GI-NEXT: udiv w0, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[2] ; CHECK-GI-NEXT: mov w17, v7.s[2] -; CHECK-GI-NEXT: fmov s22, w18 +; CHECK-GI-NEXT: mov v22.s[0], w1 ; CHECK-GI-NEXT: uzp1 v0.8h, v28.8h, v0.8h -; CHECK-GI-NEXT: udiv w0, w16, w17 +; CHECK-GI-NEXT: udiv w18, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[3] ; CHECK-GI-NEXT: mov w17, v7.s[3] ; CHECK-GI-NEXT: ushll2 v5.4s, v6.8h, #0 ; CHECK-GI-NEXT: ushll2 v7.4s, v4.8h, #0 -; CHECK-GI-NEXT: mov v22.s[1], w1 +; CHECK-GI-NEXT: mov v22.s[1], w0 ; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 ; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 ; CHECK-GI-NEXT: fmov w2, s7 -; CHECK-GI-NEXT: mov w3, v7.s[3] +; CHECK-GI-NEXT: mov w4, v7.s[3] ; CHECK-GI-NEXT: udiv w16, w16, w17 ; CHECK-GI-NEXT: fmov w17, s5 -; CHECK-GI-NEXT: mov v22.s[2], w0 +; CHECK-GI-NEXT: mov v22.s[2], w18 ; CHECK-GI-NEXT: udiv w5, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[1] ; CHECK-GI-NEXT: mov w2, v7.s[1] ; CHECK-GI-NEXT: mov v22.s[3], w16 ; CHECK-GI-NEXT: mls v6.4s, v22.4s, v4.4s -; CHECK-GI-NEXT: udiv w4, w17, w2 +; CHECK-GI-NEXT: udiv w3, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[2] ; CHECK-GI-NEXT: mov w2, v7.s[2] -; CHECK-GI-NEXT: fmov s23, w5 +; CHECK-GI-NEXT: mov v23.s[0], w5 ; CHECK-GI-NEXT: udiv w2, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[3] -; CHECK-GI-NEXT: mov v23.s[1], w4 -; CHECK-GI-NEXT: udiv w17, w17, w3 -; CHECK-GI-NEXT: fmov w3, s18 +; CHECK-GI-NEXT: mov v23.s[1], w3 +; CHECK-GI-NEXT: udiv w17, w17, w4 +; CHECK-GI-NEXT: fmov w4, s18 ; CHECK-GI-NEXT: mov v23.s[2], w2 -; CHECK-GI-NEXT: udiv w20, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[1] +; CHECK-GI-NEXT: udiv w20, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[1] ; CHECK-GI-NEXT: mov w6, v19.s[1] ; CHECK-GI-NEXT: mov v23.s[3], w17 ; CHECK-GI-NEXT: mls v5.4s, v23.4s, v7.4s -; CHECK-GI-NEXT: udiv w19, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[2] +; CHECK-GI-NEXT: udiv w19, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[2] ; CHECK-GI-NEXT: mov w6, v19.s[2] -; CHECK-GI-NEXT: fmov s24, w20 +; CHECK-GI-NEXT: mov v24.s[0], w20 ; CHECK-GI-NEXT: uzp1 v2.8h, v6.8h, v5.8h ; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-GI-NEXT: udiv w6, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[3] +; CHECK-GI-NEXT: udiv w6, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[3] ; CHECK-GI-NEXT: mov v24.s[1], w19 -; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: udiv w3, w3, w7 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: udiv w4, w4, w7 ; CHECK-GI-NEXT: fmov w7, s16 ; CHECK-GI-NEXT: mov v24.s[2], w6 -; CHECK-GI-NEXT: udiv w23, w7, w21 +; CHECK-GI-NEXT: udiv w24, w7, w21 ; CHECK-GI-NEXT: mov w7, 
v16.s[1] ; CHECK-GI-NEXT: mov w21, v17.s[1] -; CHECK-GI-NEXT: mov v24.s[3], w3 -; CHECK-GI-NEXT: udiv w24, w7, w21 +; CHECK-GI-NEXT: mov v24.s[3], w4 +; CHECK-GI-NEXT: udiv w22, w7, w21 ; CHECK-GI-NEXT: mov w7, v16.s[2] ; CHECK-GI-NEXT: mov w21, v17.s[2] ; CHECK-GI-NEXT: ushll2 v17.8h, v1.16b, #0 -; CHECK-GI-NEXT: fmov s25, w23 +; CHECK-GI-NEXT: mov v25.s[0], w24 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: ushll v18.4s, v17.4h, #0 ; CHECK-GI-NEXT: ushll v29.4s, v1.4h, #0 @@ -1942,9 +1935,8 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: udiv w21, w7, w21 ; CHECK-GI-NEXT: mov w7, v16.s[3] ; CHECK-GI-NEXT: ushll2 v16.8h, v3.16b, #0 -; CHECK-GI-NEXT: mov v25.s[1], w24 +; CHECK-GI-NEXT: mov v25.s[1], w22 ; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-GI-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: ushll v19.4s, v16.4h, #0 ; CHECK-GI-NEXT: ushll v31.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 @@ -1954,51 +1946,51 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: mov w28, v19.s[3] ; CHECK-GI-NEXT: ushll2 v19.4s, v16.8h, #0 ; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 -; CHECK-GI-NEXT: udiv w7, w7, w22 -; CHECK-GI-NEXT: fmov w22, s18 +; CHECK-GI-NEXT: udiv w7, w7, w23 +; CHECK-GI-NEXT: fmov w23, s18 ; CHECK-GI-NEXT: mov v25.s[2], w21 ; CHECK-GI-NEXT: mls v29.4s, v24.4s, v31.4s +; CHECK-GI-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: fmov w29, s19 ; CHECK-GI-NEXT: mov w30, v19.s[1] -; CHECK-GI-NEXT: mov w8, v19.s[2] -; CHECK-GI-NEXT: mov w10, v19.s[3] -; CHECK-GI-NEXT: udiv w25, w22, w25 -; CHECK-GI-NEXT: mov w22, v18.s[1] +; CHECK-GI-NEXT: mov w15, v19.s[2] +; CHECK-GI-NEXT: udiv w25, w23, w25 +; CHECK-GI-NEXT: mov w23, v18.s[1] ; CHECK-GI-NEXT: mov v25.s[3], w7 ; CHECK-GI-NEXT: mls v1.4s, v25.4s, v3.4s -; CHECK-GI-NEXT: udiv w26, w22, w26 -; CHECK-GI-NEXT: mov w22, v18.s[2] -; CHECK-GI-NEXT: fmov s26, w25 +; CHECK-GI-NEXT: udiv w26, w23, w26 +; CHECK-GI-NEXT: mov w23, v18.s[2] +; CHECK-GI-NEXT: mov v26.s[0], w25 ; CHECK-GI-NEXT: uzp1 v1.8h, v29.8h, v1.8h -; CHECK-GI-NEXT: udiv w27, w22, w27 -; CHECK-GI-NEXT: mov w22, v18.s[3] +; CHECK-GI-NEXT: udiv w27, w23, w27 +; CHECK-GI-NEXT: mov w23, v18.s[3] ; CHECK-GI-NEXT: ushll2 v18.4s, v17.8h, #0 ; CHECK-GI-NEXT: mov v26.s[1], w26 ; CHECK-GI-NEXT: ushll v17.4s, v17.4h, #0 -; CHECK-GI-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov w11, v18.s[2] ; CHECK-GI-NEXT: mov w9, v18.s[3] -; CHECK-GI-NEXT: udiv w22, w22, w28 +; CHECK-GI-NEXT: udiv w23, w23, w28 ; CHECK-GI-NEXT: fmov w28, s18 ; CHECK-GI-NEXT: mov v26.s[2], w27 ; CHECK-GI-NEXT: udiv w28, w28, w29 ; CHECK-GI-NEXT: mov w29, v18.s[1] -; CHECK-GI-NEXT: mov v26.s[3], w22 -; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v26.s[3], w23 +; CHECK-GI-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mls v17.4s, v26.4s, v16.4s ; CHECK-GI-NEXT: udiv w29, w29, w30 -; CHECK-GI-NEXT: mov w30, v18.s[2] -; CHECK-GI-NEXT: fmov s27, w28 -; CHECK-GI-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: udiv w8, w30, w8 +; CHECK-GI-NEXT: mov v27.s[0], w28 +; CHECK-GI-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: udiv w10, w11, w15 +; CHECK-GI-NEXT: mov w11, v19.s[3] ; CHECK-GI-NEXT: mov v27.s[1], w29 -; CHECK-GI-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: udiv w9, 
w9, w10 -; CHECK-GI-NEXT: mov v27.s[2], w8 -; CHECK-GI-NEXT: mov v27.s[3], w9 +; CHECK-GI-NEXT: udiv w8, w9, w11 +; CHECK-GI-NEXT: mov v27.s[2], w10 +; CHECK-GI-NEXT: mov v27.s[3], w8 ; CHECK-GI-NEXT: mls v18.4s, v27.4s, v19.4s ; CHECK-GI-NEXT: uzp1 v3.8h, v17.8h, v18.8h ; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v3.16b -; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; CHECK-GI-NEXT: ret entry: %s = urem <32 x i8> %d, %e @@ -2037,7 +2029,7 @@ define <2 x i16> @sv2i16(<2 x i16> %d, <2 x i16> %e) { ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -2086,11 +2078,9 @@ define <3 x i16> @sv3i16(<3 x i16> %d, <3 x i16> %e) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: sdiv w16, w14, w15 ; CHECK-GI-NEXT: msub w9, w13, w12, w11 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w9 ; CHECK-GI-NEXT: msub w8, w16, w15, w14 -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -2139,7 +2129,7 @@ define <4 x i16> @sv4i16(<4 x i16> %d, <4 x i16> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -2214,12 +2204,12 @@ define <8 x i16> @sv8i16(<8 x i16> %d, <8 x i16> %e) { ; CHECK-GI-NEXT: fmov w13, s1 ; CHECK-GI-NEXT: mov w14, v1.s[1] ; CHECK-GI-NEXT: mov w15, v1.s[2] -; CHECK-GI-NEXT: mov w16, v1.s[3] ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v2.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v2.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v0.s[3] ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v2.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -2232,11 +2222,11 @@ define <8 x i16> @sv8i16(<8 x i16> %d, <8 x i16> %e) { ; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s ; CHECK-GI-NEXT: sdiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v0.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v1.s[3] ; CHECK-GI-NEXT: sdiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v0.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: sdiv w8, w15, w16 +; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s @@ -2397,18 +2387,17 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mov w1, v7.s[3] ; CHECK-GI-NEXT: sshll2 v7.4s, v3.8h, #0 ; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: sdiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[1] ; CHECK-GI-NEXT: mov w9, v5.s[1] ; CHECK-GI-NEXT: fmov w2, s7 ; CHECK-GI-NEXT: mov w3, v7.s[1] ; CHECK-GI-NEXT: mov w4, v7.s[2] -; CHECK-GI-NEXT: mov w5, v7.s[3] -; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: sdiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[2] ; CHECK-GI-NEXT: mov w9, v5.s[2] ; CHECK-GI-NEXT: sshll2 v5.4s, v2.8h, #0 -; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: mov v16.s[0], w11 ; CHECK-GI-NEXT: sshll 
v2.4s, v2.4h, #0 ; CHECK-GI-NEXT: fmov w13, s5 ; CHECK-GI-NEXT: mov w14, v5.s[1] @@ -2417,7 +2406,7 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: sdiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[3] ; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: mov v16.s[1], w10 ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 @@ -2428,7 +2417,8 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v0.4s, v16.4s, v2.4s ; CHECK-GI-NEXT: sdiv w14, w12, w14 ; CHECK-GI-NEXT: mov w12, v4.s[2] -; CHECK-GI-NEXT: fmov s17, w13 +; CHECK-GI-NEXT: mov v17.s[0], w13 +; CHECK-GI-NEXT: mov w13, v7.s[3] ; CHECK-GI-NEXT: sdiv w15, w12, w15 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: mov v17.s[1], w14 @@ -2441,13 +2431,14 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s ; CHECK-GI-NEXT: sdiv w17, w17, w18 ; CHECK-GI-NEXT: mov w18, v6.s[2] -; CHECK-GI-NEXT: fmov s18, w16 +; CHECK-GI-NEXT: mov v18.s[0], w16 ; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v4.8h ; CHECK-GI-NEXT: sdiv w18, w18, w0 ; CHECK-GI-NEXT: mov w0, v6.s[3] ; CHECK-GI-NEXT: sshll2 v6.4s, v1.8h, #0 ; CHECK-GI-NEXT: mov v18.s[1], w17 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov w11, v6.s[3] ; CHECK-GI-NEXT: sdiv w0, w0, w1 ; CHECK-GI-NEXT: fmov w1, s6 ; CHECK-GI-NEXT: mov v18.s[2], w18 @@ -2457,11 +2448,10 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v1.4s, v18.4s, v3.4s ; CHECK-GI-NEXT: sdiv w2, w2, w3 ; CHECK-GI-NEXT: mov w3, v6.s[2] -; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: mov v19.s[0], w1 ; CHECK-GI-NEXT: sdiv w3, w3, w4 -; CHECK-GI-NEXT: mov w4, v6.s[3] ; CHECK-GI-NEXT: mov v19.s[1], w2 -; CHECK-GI-NEXT: sdiv w10, w4, w5 +; CHECK-GI-NEXT: sdiv w10, w11, w13 ; CHECK-GI-NEXT: mov v19.s[2], w3 ; CHECK-GI-NEXT: mov v19.s[3], w10 ; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s @@ -2502,7 +2492,7 @@ define <2 x i16> @uv2i16(<2 x i16> %d, <2 x i16> %e) { ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -2556,11 +2546,9 @@ define <3 x i16> @uv3i16(<3 x i16> %d, <3 x i16> %e) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: udiv w16, w14, w15 ; CHECK-GI-NEXT: msub w9, w13, w12, w11 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w9 ; CHECK-GI-NEXT: msub w8, w16, w15, w14 -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -2609,7 +2597,7 @@ define <4 x i16> @uv4i16(<4 x i16> %d, <4 x i16> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -2684,12 +2672,12 @@ define <8 x i16> @uv8i16(<8 x i16> %d, <8 x i16> %e) { ; CHECK-GI-NEXT: fmov w13, s1 ; CHECK-GI-NEXT: mov w14, v1.s[1] ; CHECK-GI-NEXT: mov w15, v1.s[2] -; CHECK-GI-NEXT: mov w16, v1.s[3] ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v2.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; 
CHECK-GI-NEXT: mov w10, v2.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v0.s[3] ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v2.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -2702,11 +2690,11 @@ define <8 x i16> @uv8i16(<8 x i16> %d, <8 x i16> %e) { ; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s ; CHECK-GI-NEXT: udiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v0.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v1.s[3] ; CHECK-GI-NEXT: udiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v0.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: udiv w8, w15, w16 +; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s @@ -2867,18 +2855,17 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mov w1, v7.s[3] ; CHECK-GI-NEXT: ushll2 v7.4s, v3.8h, #0 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: udiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[1] ; CHECK-GI-NEXT: mov w9, v5.s[1] ; CHECK-GI-NEXT: fmov w2, s7 ; CHECK-GI-NEXT: mov w3, v7.s[1] ; CHECK-GI-NEXT: mov w4, v7.s[2] -; CHECK-GI-NEXT: mov w5, v7.s[3] -; CHECK-GI-NEXT: udiv w11, w8, w9 +; CHECK-GI-NEXT: udiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[2] ; CHECK-GI-NEXT: mov w9, v5.s[2] ; CHECK-GI-NEXT: ushll2 v5.4s, v2.8h, #0 -; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: mov v16.s[0], w11 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-GI-NEXT: fmov w13, s5 ; CHECK-GI-NEXT: mov w14, v5.s[1] @@ -2887,7 +2874,7 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: udiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[3] ; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: mov v16.s[1], w10 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 @@ -2898,7 +2885,8 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v0.4s, v16.4s, v2.4s ; CHECK-GI-NEXT: udiv w14, w12, w14 ; CHECK-GI-NEXT: mov w12, v4.s[2] -; CHECK-GI-NEXT: fmov s17, w13 +; CHECK-GI-NEXT: mov v17.s[0], w13 +; CHECK-GI-NEXT: mov w13, v7.s[3] ; CHECK-GI-NEXT: udiv w15, w12, w15 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: mov v17.s[1], w14 @@ -2911,13 +2899,14 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s ; CHECK-GI-NEXT: udiv w17, w17, w18 ; CHECK-GI-NEXT: mov w18, v6.s[2] -; CHECK-GI-NEXT: fmov s18, w16 +; CHECK-GI-NEXT: mov v18.s[0], w16 ; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v4.8h ; CHECK-GI-NEXT: udiv w18, w18, w0 ; CHECK-GI-NEXT: mov w0, v6.s[3] ; CHECK-GI-NEXT: ushll2 v6.4s, v1.8h, #0 ; CHECK-GI-NEXT: mov v18.s[1], w17 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov w11, v6.s[3] ; CHECK-GI-NEXT: udiv w0, w0, w1 ; CHECK-GI-NEXT: fmov w1, s6 ; CHECK-GI-NEXT: mov v18.s[2], w18 @@ -2927,11 +2916,10 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v1.4s, v18.4s, v3.4s ; CHECK-GI-NEXT: udiv w2, w2, w3 ; CHECK-GI-NEXT: mov w3, v6.s[2] -; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: mov v19.s[0], w1 ; CHECK-GI-NEXT: udiv w3, w3, w4 -; CHECK-GI-NEXT: mov w4, v6.s[3] ; CHECK-GI-NEXT: mov v19.s[1], w2 -; CHECK-GI-NEXT: udiv w10, w4, w5 +; CHECK-GI-NEXT: udiv w10, w11, w13 ; CHECK-GI-NEXT: mov v19.s[2], w3 ; CHECK-GI-NEXT: mov v19.s[3], w10 ; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s @@ 
-2970,7 +2958,7 @@ define <2 x i32> @sv2i32(<2 x i32> %d, <2 x i32> %e) { ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -3002,10 +2990,10 @@ define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) { ; ; CHECK-GI-LABEL: sv3i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v1.s[1] ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] ; CHECK-GI-NEXT: mov s0, v0.s[2] ; CHECK-GI-NEXT: mov s1, v1.s[2] ; CHECK-GI-NEXT: sdiv w10, w8, w9 @@ -3015,11 +3003,11 @@ define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) { ; CHECK-GI-NEXT: fmov w15, s1 ; CHECK-GI-NEXT: sdiv w13, w11, w12 ; CHECK-GI-NEXT: msub w8, w10, w9, w8 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: sdiv w16, w14, w15 -; CHECK-GI-NEXT: msub w9, w13, w12, w11 -; CHECK-GI-NEXT: mov v0.s[1], w9 -; CHECK-GI-NEXT: msub w8, w16, w15, w14 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: sdiv w9, w14, w15 +; CHECK-GI-NEXT: msub w8, w13, w12, w11 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: msub w8, w9, w15, w14 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -3063,7 +3051,7 @@ define <4 x i32> @sv4i32(<4 x i32> %d, <4 x i32> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -3141,12 +3129,12 @@ define <8 x i32> @sv8i32(<8 x i32> %d, <8 x i32> %e) { ; CHECK-GI-NEXT: fmov w13, s3 ; CHECK-GI-NEXT: mov w14, v3.s[1] ; CHECK-GI-NEXT: mov w15, v3.s[2] -; CHECK-GI-NEXT: mov w16, v3.s[3] ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v1.s[3] ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -3159,11 +3147,11 @@ define <8 x i32> @sv8i32(<8 x i32> %d, <8 x i32> %e) { ; CHECK-GI-NEXT: mls v0.4s, v4.4s, v2.4s ; CHECK-GI-NEXT: sdiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v1.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v3.s[3] ; CHECK-GI-NEXT: sdiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v1.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: sdiv w8, w15, w16 +; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v1.4s, v5.4s, v3.4s @@ -3201,7 +3189,7 @@ define <2 x i32> @uv2i32(<2 x i32> %d, <2 x i32> %e) { ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -3233,10 +3221,10 @@ define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) { ; ; CHECK-GI-LABEL: uv3i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v1.s[1] ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov s2, v0.s[1] +; 
CHECK-GI-NEXT: mov s3, v1.s[1] ; CHECK-GI-NEXT: mov s0, v0.s[2] ; CHECK-GI-NEXT: mov s1, v1.s[2] ; CHECK-GI-NEXT: udiv w10, w8, w9 @@ -3246,11 +3234,11 @@ define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) { ; CHECK-GI-NEXT: fmov w15, s1 ; CHECK-GI-NEXT: udiv w13, w11, w12 ; CHECK-GI-NEXT: msub w8, w10, w9, w8 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: udiv w16, w14, w15 -; CHECK-GI-NEXT: msub w9, w13, w12, w11 -; CHECK-GI-NEXT: mov v0.s[1], w9 -; CHECK-GI-NEXT: msub w8, w16, w15, w14 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: udiv w9, w14, w15 +; CHECK-GI-NEXT: msub w8, w13, w12, w11 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: msub w8, w9, w15, w14 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -3294,7 +3282,7 @@ define <4 x i32> @uv4i32(<4 x i32> %d, <4 x i32> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -3372,12 +3360,12 @@ define <8 x i32> @uv8i32(<8 x i32> %d, <8 x i32> %e) { ; CHECK-GI-NEXT: fmov w13, s3 ; CHECK-GI-NEXT: mov w14, v3.s[1] ; CHECK-GI-NEXT: mov w15, v3.s[2] -; CHECK-GI-NEXT: mov w16, v3.s[3] ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v1.s[3] ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -3390,11 +3378,11 @@ define <8 x i32> @uv8i32(<8 x i32> %d, <8 x i32> %e) { ; CHECK-GI-NEXT: mls v0.4s, v4.4s, v2.4s ; CHECK-GI-NEXT: udiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v1.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v3.s[3] ; CHECK-GI-NEXT: udiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v1.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: udiv w8, w15, w16 +; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v1.4s, v5.4s, v3.4s @@ -3427,14 +3415,14 @@ define <2 x i64> @sv2i64(<2 x i64> %d, <2 x i64> %e) { ; CHECK-GI-NEXT: mov x11, v0.d[1] ; CHECK-GI-NEXT: sdiv x8, x8, x9 ; CHECK-GI-NEXT: sdiv x11, x11, x10 -; CHECK-GI-NEXT: fmov d1, x8 +; CHECK-GI-NEXT: mov v1.d[0], x8 ; CHECK-GI-NEXT: mov v1.d[1], x11 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v1.d[1] -; CHECK-GI-NEXT: mul x9, x11, x9 -; CHECK-GI-NEXT: mul x8, x8, x10 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov x11, v1.d[1] +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: mul x9, x11, x10 +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-GI-NEXT: ret entry: @@ -3481,21 +3469,21 @@ define <3 x i64> @sv3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: sdiv x8, x8, x9 ; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: fmov x14, d3 -; CHECK-GI-NEXT: mov x12, v3.d[1] +; CHECK-GI-NEXT: fmov x11, d3 +; CHECK-GI-NEXT: mov x14, v3.d[1] ; CHECK-GI-NEXT: sdiv x9, x9, x10 -; CHECK-GI-NEXT: fmov d6, x8 +; CHECK-GI-NEXT: mov v6.d[0], x8 ; CHECK-GI-NEXT: fmov x8, d2 ; CHECK-GI-NEXT: mov v6.d[1], x9 ; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: sdiv x10, x8, x9 -; CHECK-GI-NEXT: fmov x13, d6 -; CHECK-GI-NEXT: mov x11, 
v6.d[1] -; CHECK-GI-NEXT: mul x13, x13, x14 -; CHECK-GI-NEXT: mul x11, x11, x12 -; CHECK-GI-NEXT: fmov d2, x13 +; CHECK-GI-NEXT: sdiv x12, x8, x9 +; CHECK-GI-NEXT: fmov x10, d6 +; CHECK-GI-NEXT: mov x13, v6.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x11, x13, x14 +; CHECK-GI-NEXT: mov v2.d[0], x10 ; CHECK-GI-NEXT: mov v2.d[1], x11 -; CHECK-GI-NEXT: msub x8, x10, x9, x8 +; CHECK-GI-NEXT: msub x8, x12, x9, x8 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -3542,26 +3530,26 @@ define <4 x i64> @sv4i64(<4 x i64> %d, <4 x i64> %e) { ; CHECK-GI-NEXT: mov x14, v3.d[1] ; CHECK-GI-NEXT: mov x15, v1.d[1] ; CHECK-GI-NEXT: sdiv x8, x8, x9 -; CHECK-GI-NEXT: sdiv x11, x11, x10 -; CHECK-GI-NEXT: fmov d2, x8 ; CHECK-GI-NEXT: sdiv x12, x12, x13 -; CHECK-GI-NEXT: mov v2.d[1], x11 -; CHECK-GI-NEXT: fmov x11, d2 -; CHECK-GI-NEXT: mov x8, v2.d[1] -; CHECK-GI-NEXT: mul x9, x11, x9 -; CHECK-GI-NEXT: mul x8, x8, x10 -; CHECK-GI-NEXT: fmov d2, x9 -; CHECK-GI-NEXT: mov v2.d[1], x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: sdiv x11, x11, x10 +; CHECK-GI-NEXT: mov v3.d[0], x12 ; CHECK-GI-NEXT: sdiv x15, x15, x14 -; CHECK-GI-NEXT: fmov d3, x12 -; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov v2.d[1], x11 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov x11, v2.d[1] +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: mul x10, x11, x10 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: mov v3.d[1], x15 -; CHECK-GI-NEXT: fmov x11, d3 -; CHECK-GI-NEXT: mov x10, v3.d[1] -; CHECK-GI-NEXT: mul x11, x11, x13 -; CHECK-GI-NEXT: mul x10, x10, x14 -; CHECK-GI-NEXT: fmov d3, x11 -; CHECK-GI-NEXT: mov v3.d[1], x10 +; CHECK-GI-NEXT: mov v2.d[1], x10 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: mov x12, v3.d[1] +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mul x9, x9, x13 +; CHECK-GI-NEXT: mul x11, x12, x14 +; CHECK-GI-NEXT: mov v3.d[0], x9 +; CHECK-GI-NEXT: mov v3.d[1], x11 ; CHECK-GI-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-GI-NEXT: ret entry: @@ -3592,14 +3580,14 @@ define <2 x i64> @uv2i64(<2 x i64> %d, <2 x i64> %e) { ; CHECK-GI-NEXT: mov x11, v0.d[1] ; CHECK-GI-NEXT: udiv x8, x8, x9 ; CHECK-GI-NEXT: udiv x11, x11, x10 -; CHECK-GI-NEXT: fmov d1, x8 +; CHECK-GI-NEXT: mov v1.d[0], x8 ; CHECK-GI-NEXT: mov v1.d[1], x11 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v1.d[1] -; CHECK-GI-NEXT: mul x9, x11, x9 -; CHECK-GI-NEXT: mul x8, x8, x10 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov x11, v1.d[1] +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: mul x9, x11, x10 +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-GI-NEXT: ret entry: @@ -3646,21 +3634,21 @@ define <3 x i64> @uv3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: udiv x8, x8, x9 ; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: fmov x14, d3 -; CHECK-GI-NEXT: mov x12, v3.d[1] +; CHECK-GI-NEXT: fmov x11, d3 +; CHECK-GI-NEXT: mov x14, v3.d[1] ; CHECK-GI-NEXT: udiv x9, x9, x10 -; CHECK-GI-NEXT: fmov d6, x8 +; CHECK-GI-NEXT: mov v6.d[0], x8 ; CHECK-GI-NEXT: fmov x8, d2 ; CHECK-GI-NEXT: mov v6.d[1], x9 ; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: udiv x10, x8, x9 -; CHECK-GI-NEXT: fmov x13, d6 -; CHECK-GI-NEXT: mov x11, v6.d[1] -; CHECK-GI-NEXT: mul x13, x13, x14 -; CHECK-GI-NEXT: mul x11, x11, x12 -; CHECK-GI-NEXT: fmov d2, x13 +; CHECK-GI-NEXT: 
udiv x12, x8, x9 +; CHECK-GI-NEXT: fmov x10, d6 +; CHECK-GI-NEXT: mov x13, v6.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x11, x13, x14 +; CHECK-GI-NEXT: mov v2.d[0], x10 ; CHECK-GI-NEXT: mov v2.d[1], x11 -; CHECK-GI-NEXT: msub x8, x10, x9, x8 +; CHECK-GI-NEXT: msub x8, x12, x9, x8 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -3707,26 +3695,26 @@ define <4 x i64> @uv4i64(<4 x i64> %d, <4 x i64> %e) { ; CHECK-GI-NEXT: mov x14, v3.d[1] ; CHECK-GI-NEXT: mov x15, v1.d[1] ; CHECK-GI-NEXT: udiv x8, x8, x9 -; CHECK-GI-NEXT: udiv x11, x11, x10 -; CHECK-GI-NEXT: fmov d2, x8 ; CHECK-GI-NEXT: udiv x12, x12, x13 -; CHECK-GI-NEXT: mov v2.d[1], x11 -; CHECK-GI-NEXT: fmov x11, d2 -; CHECK-GI-NEXT: mov x8, v2.d[1] -; CHECK-GI-NEXT: mul x9, x11, x9 -; CHECK-GI-NEXT: mul x8, x8, x10 -; CHECK-GI-NEXT: fmov d2, x9 -; CHECK-GI-NEXT: mov v2.d[1], x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: udiv x11, x11, x10 +; CHECK-GI-NEXT: mov v3.d[0], x12 ; CHECK-GI-NEXT: udiv x15, x15, x14 -; CHECK-GI-NEXT: fmov d3, x12 -; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov v2.d[1], x11 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov x11, v2.d[1] +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: mul x10, x11, x10 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: mov v3.d[1], x15 -; CHECK-GI-NEXT: fmov x11, d3 -; CHECK-GI-NEXT: mov x10, v3.d[1] -; CHECK-GI-NEXT: mul x11, x11, x13 -; CHECK-GI-NEXT: mul x10, x10, x14 -; CHECK-GI-NEXT: fmov d3, x11 -; CHECK-GI-NEXT: mov v3.d[1], x10 +; CHECK-GI-NEXT: mov v2.d[1], x10 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: mov x12, v3.d[1] +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mul x9, x9, x13 +; CHECK-GI-NEXT: mul x11, x12, x14 +; CHECK-GI-NEXT: mov v3.d[0], x9 +; CHECK-GI-NEXT: mov v3.d[1], x11 ; CHECK-GI-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index fa0447c2c5d79..adac75758220e 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -165,18 +165,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: sqadd v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: sqadd v0.8b, v3.8b, v5.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x2] ; CHECK-GI-NEXT: ret @@ -249,12 +251,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: ; 
CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: sqadd v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-GI-NEXT: sqadd v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: str h0, [x2] ; CHECK-GI-NEXT: str h1, [x2, #2] diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 529a3b72e0971..0f256c1f18f58 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -224,15 +224,13 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) { ; CHECK-GI-NEXT: lsl w10, w2, #8 ; CHECK-GI-NEXT: sxth w8, w8 ; CHECK-GI-NEXT: sxth w9, w9 -; CHECK-GI-NEXT: sxth w10, w10 ; CHECK-GI-NEXT: asr w8, w8, #8 ; CHECK-GI-NEXT: asr w9, w9, #8 ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: asr w8, w10, #8 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: sxth w8, w10 +; CHECK-GI-NEXT: asr w8, w8, #8 +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -254,10 +252,10 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) { ; CHECK-GI-LABEL: sext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxtb w8, w0 -; CHECK-GI-NEXT: sxtb w9, w1 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: sxtb w8, w1 +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: sxtb w8, w2 -; CHECK-GI-NEXT: mov v0.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -311,7 +309,7 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) { ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: smov w8, v0.h[0] ; CHECK-GI-NEXT: smov w9, v0.h[1] -; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: smov w8, v0.h[2] ; CHECK-GI-NEXT: mov v1.s[1], w9 ; CHECK-GI-NEXT: mov v1.s[2], w8 @@ -391,15 +389,13 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) { ; CHECK-GI-NEXT: lsl w10, w2, #6 ; CHECK-GI-NEXT: sxth w8, w8 ; CHECK-GI-NEXT: sxth w9, w9 -; CHECK-GI-NEXT: sxth w10, w10 ; CHECK-GI-NEXT: asr w8, w8, #6 ; CHECK-GI-NEXT: asr w9, w9, #6 ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: asr w8, w10, #6 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: sxth w8, w10 +; CHECK-GI-NEXT: asr w8, w8, #6 +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -421,10 +417,10 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) { ; CHECK-GI-LABEL: sext_v3i10_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sbfx w8, w0, #0, #10 -; CHECK-GI-NEXT: sbfx w9, w1, #0, #10 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: sbfx w8, w1, #0, #10 +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: sbfx w8, w2, #0, #10 -; CHECK-GI-NEXT: mov v0.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -1033,43 +1029,29 @@ define <16 x i16> @sext_v16i10_v16i16(<16 x i10> %a) { ; CHECK-GI-LABEL: sext_v16i10_v16i16: ; CHECK-GI: // 
%bb.0: // %entry ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s2, w1 +; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: fmov s3, w9 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w3 -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #32] -; CHECK-GI-NEXT: mov v0.h[3], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w4 -; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[4], w4 +; CHECK-GI-NEXT: mov v1.h[4], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #40] -; CHECK-GI-NEXT: mov v0.h[4], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[5], w5 +; CHECK-GI-NEXT: mov v1.h[5], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #48] -; CHECK-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w6 -; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[6], w6 +; CHECK-GI-NEXT: mov v1.h[6], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #56] -; CHECK-GI-NEXT: mov v0.h[6], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w7 -; CHECK-GI-NEXT: mov v1.h[6], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 -; CHECK-GI-NEXT: mov v0.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v3.h[0] +; CHECK-GI-NEXT: mov v0.h[7], w7 +; CHECK-GI-NEXT: mov v1.h[7], w8 ; CHECK-GI-NEXT: shl v0.8h, v0.8h, #6 ; CHECK-GI-NEXT: shl v1.8h, v1.8h, #6 ; CHECK-GI-NEXT: sshr v0.8h, v0.8h, #6 @@ -1123,54 +1105,42 @@ define <16 x i32> @sext_v16i10_v16i32(<16 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v16i10_v16i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: ldr w9, [sp, #8] -; CHECK-GI-NEXT: ldr w10, [sp, #32] +; CHECK-GI-NEXT: ldr w9, [sp, #32] +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: ldr w10, [sp, #8] ; CHECK-GI-NEXT: ldr w11, [sp, #40] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s3, w9 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: fmov s5, w10 -; CHECK-GI-NEXT: fmov s6, w11 +; CHECK-GI-NEXT: mov v0.h[1], w1 ; CHECK-GI-NEXT: ldr w9, [sp, #48] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: fmov s6, w9 +; CHECK-GI-NEXT: mov v1.h[1], w5 +; CHECK-GI-NEXT: mov v2.h[1], w10 +; CHECK-GI-NEXT: mov v3.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w6 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mov v3.h[2], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #24] ; CHECK-GI-NEXT: ldr w9, [sp, #56] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w6 -; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: mov v5.h[2], v6.h[0] -; CHECK-GI-NEXT: fmov s6, w9 -; CHECK-GI-NEXT: mov 
v1.h[2], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w3 -; CHECK-GI-NEXT: mov v3.h[3], v4.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w7 -; CHECK-GI-NEXT: mov v5.h[3], v6.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w7 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v5.4h, #0 -; CHECK-GI-NEXT: shl v2.4s, v2.4s, #22 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: shl v0.4s, v0.4s, #22 -; CHECK-GI-NEXT: shl v3.4s, v3.4s, #22 -; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #22 ; CHECK-GI-NEXT: shl v1.4s, v1.4s, #22 +; CHECK-GI-NEXT: shl v2.4s, v2.4s, #22 +; CHECK-GI-NEXT: shl v3.4s, v3.4s, #22 ; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #22 -; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #22 ; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #22 +; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #22 +; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #22 ; CHECK-GI-NEXT: ret entry: %c = sext <16 x i10> %a to <16 x i32> @@ -1228,67 +1198,55 @@ define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v16i10_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: ldr w10, [sp, #32] -; CHECK-GI-NEXT: ldr w11, [sp, #40] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s0, w0 ; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w9, [sp, #48] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr w9, [sp, #8] +; CHECK-GI-NEXT: ldr w11, [sp, #40] ; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s3, w10 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v1.h[1], w5 +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: mov v3.h[1], w11 +; CHECK-GI-NEXT: ldr w9, [sp, #48] +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w6 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mov v3.h[2], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w10 -; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w2 -; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: mov v0.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w6 -; CHECK-GI-NEXT: mov v2.h[3], v5.h[0] -; CHECK-GI-NEXT: mov v1.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w9 ; CHECK-GI-NEXT: ldr w9, [sp, #56] -; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w3 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w7 -; CHECK-GI-NEXT: ushll v6.2d, v2.2s, #0 -; CHECK-GI-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-GI-NEXT: mov v1.h[3], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: shl v6.2d, v6.2d, #54 +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w7 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: shl v18.2d, v2.2d, #54 -; CHECK-GI-NEXT: mov v3.h[3], v4.h[0] ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll2 v0.2d, 
v0.4s, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll v5.2d, v1.2s, #0 ; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-GI-NEXT: shl v4.2d, v4.2d, #54 -; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54 +; CHECK-GI-NEXT: ushll v6.2d, v2.2s, #0 +; CHECK-GI-NEXT: ushll2 v2.2d, v2.4s, #0 ; CHECK-GI-NEXT: ushll v7.2d, v3.2s, #0 ; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0 +; CHECK-GI-NEXT: shl v4.2d, v4.2d, #54 +; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54 ; CHECK-GI-NEXT: shl v5.2d, v5.2d, #54 ; CHECK-GI-NEXT: shl v17.2d, v1.2d, #54 -; CHECK-GI-NEXT: sshr v0.2d, v4.2d, #54 -; CHECK-GI-NEXT: sshr v1.2d, v16.2d, #54 -; CHECK-GI-NEXT: sshr v4.2d, v6.2d, #54 +; CHECK-GI-NEXT: shl v6.2d, v6.2d, #54 +; CHECK-GI-NEXT: shl v18.2d, v2.2d, #54 ; CHECK-GI-NEXT: shl v7.2d, v7.2d, #54 ; CHECK-GI-NEXT: shl v19.2d, v3.2d, #54 +; CHECK-GI-NEXT: sshr v0.2d, v4.2d, #54 +; CHECK-GI-NEXT: sshr v1.2d, v16.2d, #54 ; CHECK-GI-NEXT: sshr v2.2d, v5.2d, #54 ; CHECK-GI-NEXT: sshr v3.2d, v17.2d, #54 +; CHECK-GI-NEXT: sshr v4.2d, v6.2d, #54 ; CHECK-GI-NEXT: sshr v5.2d, v18.2d, #54 ; CHECK-GI-NEXT: sshr v6.2d, v7.2d, #54 ; CHECK-GI-NEXT: sshr v7.2d, v19.2d, #54 diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll index 9c8d3e0f07de8..951458da17c07 100644 --- a/llvm/test/CodeGen/AArch64/shift.ll +++ b/llvm/test/CodeGen/AArch64/shift.ll @@ -537,22 +537,29 @@ define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){ ; CHECK-GI-NEXT: mov h3, v1.h[1] ; CHECK-GI-NEXT: mov h4, v0.h[2] ; CHECK-GI-NEXT: mov h5, v0.h[3] -; CHECK-GI-NEXT: mov h6, v1.h[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov h2, v1.h[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov h3, v1.h[3] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: mov v1.b[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: fmov w8, s5 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: mov v1.b[3], w9 ; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = shl <4 x i8> %0, %1 @@ -587,10 +594,10 @@ define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){ ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v1.s[1] -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 ; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] @@ -628,7 +635,7 @@ define <1 x 
i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){ ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: lsl w8, w8, w9 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = shl <1 x i32> %0, %1 @@ -684,24 +691,31 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h3, v0.h[1] ; CHECK-GI-NEXT: mov h4, v1.h[2] -; CHECK-GI-NEXT: mov h5, v1.h[3] -; CHECK-GI-NEXT: mov h6, v0.h[3] -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov v0.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v6.b[0] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov h2, v1.h[3] +; CHECK-GI-NEXT: fmov w9, s4 +; CHECK-GI-NEXT: mov h4, v0.h[3] +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s3 +; CHECK-GI-NEXT: mov h3, v0.h[2] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s3 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: mov v0.b[3], w8 ; CHECK-GI-NEXT: neg v1.8b, v1.8b ; CHECK-GI-NEXT: sshl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = ashr <4 x i8> %0, %1 @@ -734,11 +748,11 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %0, <2 x i16> %1){ ; CHECK-GI-LABEL: ashr_v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s2, v1.s[1] +; CHECK-GI-NEXT: mov w8, v1.s[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s3, v0.s[1] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[1], v3.h[0] +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[1], w9 ; CHECK-GI-NEXT: neg v1.4h, v1.4h ; CHECK-GI-NEXT: sshl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] @@ -774,7 +788,7 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){ ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: asr w8, w8, w9 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = ashr <1 x i32> %0, %1 @@ -821,24 +835,31 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h3, v0.h[1] ; CHECK-GI-NEXT: mov h4, v1.h[2] -; CHECK-GI-NEXT: mov h5, v1.h[3] -; CHECK-GI-NEXT: mov h6, v0.h[3] -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov v0.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v6.b[0] +; CHECK-GI-NEXT: fmov w8, s2 +; 
CHECK-GI-NEXT: mov h2, v1.h[3] +; CHECK-GI-NEXT: fmov w9, s4 +; CHECK-GI-NEXT: mov h4, v0.h[3] +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s3 +; CHECK-GI-NEXT: mov h3, v0.h[2] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s3 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: mov v0.b[3], w8 ; CHECK-GI-NEXT: neg v1.8b, v1.8b ; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = lshr <4 x i8> %0, %1 @@ -870,11 +891,11 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %0, <2 x i16> %1){ ; CHECK-GI-LABEL: lshr_v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s2, v1.s[1] +; CHECK-GI-NEXT: mov w8, v1.s[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s3, v0.s[1] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[1], v3.h[0] +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[1], w9 ; CHECK-GI-NEXT: neg v1.4h, v1.4h ; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] @@ -910,7 +931,7 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){ ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: lsr w8, w8, w9 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = lshr <1 x i32> %0, %1 @@ -962,16 +983,12 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){ ; CHECK-GI-LABEL: shl_v3i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: fmov s2, w3 -; CHECK-GI-NEXT: fmov s3, w4 -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v2.b[1], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w5 -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] -; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: fmov s1, w3 +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v1.b[1], w4 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: mov v1.b[2], w5 +; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w0, v0.b[0] ; CHECK-GI-NEXT: umov w1, v0.b[1] ; CHECK-GI-NEXT: umov w2, v0.b[2] @@ -1038,15 +1055,11 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){ ; CHECK-GI-LABEL: ashr_v3i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmov s0, w3 -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s2, w1 -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] ; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w4 +; CHECK-GI-NEXT: mov v1.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w5 +; CHECK-GI-NEXT: mov v1.b[2], w2 ; CHECK-GI-NEXT: neg 
v0.8b, v0.8b ; CHECK-GI-NEXT: sshl v0.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: umov w0, v0.b[0] @@ -1118,15 +1131,11 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){ ; CHECK-GI-LABEL: lshr_v3i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmov s0, w3 -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s2, w1 -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] ; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w4 +; CHECK-GI-NEXT: mov v1.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w5 +; CHECK-GI-NEXT: mov v1.b[2], w2 ; CHECK-GI-NEXT: neg v0.8b, v0.8b ; CHECK-GI-NEXT: ushl v0.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: umov w0, v0.b[0] diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index b1131f287fe9a..954458e445974 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -213,17 +213,23 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){ ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov h3, v1.h[1] -; CHECK-GI-NEXT: adrp x8, .LCPI15_0 ; CHECK-GI-NEXT: mov h4, v0.h[2] ; CHECK-GI-NEXT: mov h5, v0.h[3] -; CHECK-GI-NEXT: mov h6, v1.h[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov h2, v1.h[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov h3, v1.h[3] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: mov v1.b[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: fmov w8, s5 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: mov v1.b[3], w9 +; CHECK-GI-NEXT: adrp x8, .LCPI15_0 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI15_0] ; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b @@ -280,11 +286,11 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){ ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 ; CHECK-GI-NEXT: adrp x8, .LCPI17_0 -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0] ; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b @@ -397,8 +403,17 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){ ; ; CHECK-GI-LABEL: shufflevector_v4i8_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: dup v0.8b, w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov h1, v0.h[3] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret %c = 
shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> @@ -433,8 +448,10 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){ ; ; CHECK-GI-LABEL: shufflevector_v2i16_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: dup v0.4h, w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: dup v0.4h, v0.h[0] ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> @@ -493,18 +510,14 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) { ; CHECK-GI-LABEL: shufflevector_v3i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: fmov s1, w3 ; CHECK-GI-NEXT: adrp x8, .LCPI30_0 -; CHECK-GI-NEXT: fmov s2, w3 -; CHECK-GI-NEXT: fmov s3, w4 -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v2.b[1], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w5 -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v1.b[1], w4 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: mov v1.b[2], w5 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI30_0] -; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v0.d[1], v2.d[0] ; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b ; CHECK-GI-NEXT: mov b1, v0.b[1] ; CHECK-GI-NEXT: mov b2, v0.b[2] @@ -614,7 +627,10 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) { ; ; CHECK-GI-LABEL: shufflevector_v3i8_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v0.8b, w0 +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] ; CHECK-GI-NEXT: mov b1, v0.b[1] ; CHECK-GI-NEXT: mov b2, v0.b[2] ; CHECK-GI-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index d8b2762cf15e9..12371ef2c0021 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -166,18 +166,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: sqsub v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: sqsub v0.8b, v3.8b, v5.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x2] ; CHECK-GI-NEXT: ret @@ -250,12 +252,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr 
h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: sqsub v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-GI-NEXT: sqsub v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: str h0, [x2] ; CHECK-GI-NEXT: str h1, [x2, #2] diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 907605494dfbd..8e7586bd4843c 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -71,13 +71,13 @@ define void @v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: sub v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: sub v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -112,22 +112,18 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: sub v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -159,27 +155,27 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; 
CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -247,13 +243,13 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: sub v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -281,18 +277,16 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: sub v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index afc0d8704ebac..e99935e8677fc 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -162,18 +162,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: uqadd v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; 
CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: uqadd v0.8b, v3.8b, v5.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x2] ; CHECK-GI-NEXT: ret @@ -248,12 +250,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: uqadd v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-GI-NEXT: uqadd v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: str h0, [x2] ; CHECK-GI-NEXT: str h1, [x2, #2] diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index dfcbe96ea948a..cdba9625431a5 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -163,18 +163,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: uqsub v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: uqsub v0.8b, v3.8b, v5.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x2] ; CHECK-GI-NEXT: ret @@ -245,12 +247,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: uqsub v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-GI-NEXT: uqsub v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: str h0, [x2] ; CHECK-GI-NEXT: str h1, [x2, #2] diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 54ada05c90448..f46e6ae989ff2 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -3813,71 +3813,49 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) { ; CHECK-GI-LABEL: add_v24i8_v24i16_zext: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; 
CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: ldr w9, [sp, #64] +; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: ldr w10, [sp, #72] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w3 -; CHECK-GI-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w5 -; CHECK-GI-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w6 -; CHECK-GI-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w7 -; CHECK-GI-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: mov v0.b[3], w3 +; CHECK-GI-NEXT: mov v0.b[4], w4 +; CHECK-GI-NEXT: mov v0.b[5], w5 +; CHECK-GI-NEXT: mov v0.b[6], w6 +; CHECK-GI-NEXT: mov v0.b[7], w7 +; CHECK-GI-NEXT: mov v0.b[8], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #64] ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #8] -; CHECK-GI-NEXT: fmov s3, w8 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v0.b[9], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #80] -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #88] -; CHECK-GI-NEXT: mov v0.b[9], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[1], w10 +; CHECK-GI-NEXT: mov v0.b[10], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #96] -; CHECK-GI-NEXT: mov v0.b[10], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #88] +; CHECK-GI-NEXT: mov v0.b[11], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #32] -; CHECK-GI-NEXT: mov v1.b[3], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #104] -; CHECK-GI-NEXT: mov v0.b[11], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[3], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #96] +; CHECK-GI-NEXT: mov v0.b[12], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #40] -; CHECK-GI-NEXT: mov v1.b[4], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #112] -; CHECK-GI-NEXT: mov v0.b[12], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[4], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #104] +; CHECK-GI-NEXT: mov v0.b[13], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #48] -; CHECK-GI-NEXT: mov v1.b[5], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #120] -; CHECK-GI-NEXT: mov v0.b[13], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[5], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #112] +; CHECK-GI-NEXT: mov v0.b[14], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #56] -; CHECK-GI-NEXT: mov v1.b[6], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 -; CHECK-GI-NEXT: mov v1.b[7], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-NEXT: uaddlv h1, v1.8b +; CHECK-GI-NEXT: mov v1.b[6], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #120] +; CHECK-GI-NEXT: mov v0.b[15], w8 +; CHECK-GI-NEXT: mov v1.b[7], w9 ; CHECK-GI-NEXT: uaddlv h0, v0.16b -; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: uaddlv h1, v1.8b ; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w0, w8, w9 ; CHECK-GI-NEXT: ret entry: @@ -3960,71 +3938,49 @@ define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) { ; CHECK-GI-LABEL: add_v24i8_v24i16_sext: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 
-; CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: ldr w9, [sp, #64] +; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: ldr w10, [sp, #72] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w3 -; CHECK-GI-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w5 -; CHECK-GI-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w6 -; CHECK-GI-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w7 -; CHECK-GI-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: mov v0.b[3], w3 +; CHECK-GI-NEXT: mov v0.b[4], w4 +; CHECK-GI-NEXT: mov v0.b[5], w5 +; CHECK-GI-NEXT: mov v0.b[6], w6 +; CHECK-GI-NEXT: mov v0.b[7], w7 +; CHECK-GI-NEXT: mov v0.b[8], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #64] ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #8] -; CHECK-GI-NEXT: fmov s3, w8 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v0.b[9], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #80] -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #88] -; CHECK-GI-NEXT: mov v0.b[9], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[1], w10 +; CHECK-GI-NEXT: mov v0.b[10], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #96] -; CHECK-GI-NEXT: mov v0.b[10], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #88] +; CHECK-GI-NEXT: mov v0.b[11], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #32] -; CHECK-GI-NEXT: mov v1.b[3], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #104] -; CHECK-GI-NEXT: mov v0.b[11], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[3], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #96] +; CHECK-GI-NEXT: mov v0.b[12], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #40] -; CHECK-GI-NEXT: mov v1.b[4], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #112] -; CHECK-GI-NEXT: mov v0.b[12], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[4], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #104] +; CHECK-GI-NEXT: mov v0.b[13], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #48] -; CHECK-GI-NEXT: mov v1.b[5], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #120] -; CHECK-GI-NEXT: mov v0.b[13], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[5], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #112] +; CHECK-GI-NEXT: mov v0.b[14], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #56] -; CHECK-GI-NEXT: mov v1.b[6], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 -; CHECK-GI-NEXT: mov v1.b[7], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-NEXT: saddlv h1, v1.8b +; CHECK-GI-NEXT: mov v1.b[6], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #120] +; CHECK-GI-NEXT: mov v0.b[15], w8 +; CHECK-GI-NEXT: mov v1.b[7], w9 ; CHECK-GI-NEXT: saddlv h0, v0.16b -; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: saddlv h1, v1.8b ; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w0, w8, w9 ; CHECK-GI-NEXT: ret entry: @@ -4168,71 +4124,49 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext: ; CHECK-GI-BASE: // %bb.0: // %entry ; 
CHECK-GI-BASE-NEXT: fmov s0, w0 -; CHECK-GI-BASE-NEXT: fmov s1, w1 ; CHECK-GI-BASE-NEXT: ldr w8, [sp] -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #64] +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #8] ; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72] -; CHECK-GI-BASE-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w2 -; CHECK-GI-BASE-NEXT: fmov s2, w10 -; CHECK-GI-BASE-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w3 -; CHECK-GI-BASE-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w4 -; CHECK-GI-BASE-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w5 -; CHECK-GI-BASE-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w6 -; CHECK-GI-BASE-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w7 -; CHECK-GI-BASE-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-BASE-NEXT: mov v0.b[1], w1 +; CHECK-GI-BASE-NEXT: mov v0.b[2], w2 +; CHECK-GI-BASE-NEXT: mov v0.b[3], w3 +; CHECK-GI-BASE-NEXT: mov v0.b[4], w4 +; CHECK-GI-BASE-NEXT: mov v0.b[5], w5 +; CHECK-GI-BASE-NEXT: mov v0.b[6], w6 +; CHECK-GI-BASE-NEXT: mov v0.b[7], w7 +; CHECK-GI-BASE-NEXT: mov v0.b[8], w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #64] ; CHECK-GI-BASE-NEXT: fmov s1, w8 -; CHECK-GI-BASE-NEXT: ldr w8, [sp, #8] -; CHECK-GI-BASE-NEXT: fmov s3, w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16] -; CHECK-GI-BASE-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w9 +; CHECK-GI-BASE-NEXT: mov v0.b[9], w9 ; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80] -; CHECK-GI-BASE-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] -; CHECK-GI-BASE-NEXT: mov v0.b[9], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[1], w10 +; CHECK-GI-BASE-NEXT: mov v0.b[10], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24] -; CHECK-GI-BASE-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] -; CHECK-GI-BASE-NEXT: mov v0.b[10], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[2], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] +; CHECK-GI-BASE-NEXT: mov v0.b[11], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32] -; CHECK-GI-BASE-NEXT: mov v1.b[3], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] -; CHECK-GI-BASE-NEXT: mov v0.b[11], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[3], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] +; CHECK-GI-BASE-NEXT: mov v0.b[12], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40] -; CHECK-GI-BASE-NEXT: mov v1.b[4], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] -; CHECK-GI-BASE-NEXT: mov v0.b[12], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[4], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] +; CHECK-GI-BASE-NEXT: mov v0.b[13], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48] -; CHECK-GI-BASE-NEXT: mov v1.b[5], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120] -; CHECK-GI-BASE-NEXT: mov v0.b[13], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[5], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] +; CHECK-GI-BASE-NEXT: mov v0.b[14], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56] -; CHECK-GI-BASE-NEXT: mov v1.b[6], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 -; CHECK-GI-BASE-NEXT: mov v1.b[7], v2.b[0] -; CHECK-GI-BASE-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b +; CHECK-GI-BASE-NEXT: mov v1.b[6], w9 +; CHECK-GI-BASE-NEXT: 
ldr w9, [sp, #120] +; CHECK-GI-BASE-NEXT: mov v0.b[15], w8 +; CHECK-GI-BASE-NEXT: mov v1.b[7], w9 ; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b -; CHECK-GI-BASE-NEXT: fmov w9, s1 +; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b ; CHECK-GI-BASE-NEXT: fmov w8, s0 +; CHECK-GI-BASE-NEXT: fmov w9, s1 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff ; CHECK-GI-BASE-NEXT: ret @@ -4240,76 +4174,54 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext: ; CHECK-GI-DOT: // %bb.0: // %entry ; CHECK-GI-DOT-NEXT: fmov s0, w0 -; CHECK-GI-DOT-NEXT: fmov s1, w1 -; CHECK-GI-DOT-NEXT: ldr w8, [sp] ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64] +; CHECK-GI-DOT-NEXT: ldr w8, [sp] ; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72] -; CHECK-GI-DOT-NEXT: movi v4.8b, #1 -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] -; CHECK-GI-DOT-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w2 -; CHECK-GI-DOT-NEXT: fmov s3, w10 -; CHECK-GI-DOT-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w3 -; CHECK-GI-DOT-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w4 -; CHECK-GI-DOT-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w5 -; CHECK-GI-DOT-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w6 -; CHECK-GI-DOT-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w7 -; CHECK-GI-DOT-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-DOT-NEXT: movi v2.8b, #1 +; CHECK-GI-DOT-NEXT: movi v3.8b, #1 ; CHECK-GI-DOT-NEXT: fmov s1, w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80] -; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: mov v0.b[1], w1 +; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: mov v1.b[1], w10 +; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0] +; CHECK-GI-DOT-NEXT: mov v0.b[2], w2 +; CHECK-GI-DOT-NEXT: mov v1.b[2], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88] -; CHECK-GI-DOT-NEXT: mov v0.b[8], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] -; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[3], w3 +; CHECK-GI-DOT-NEXT: mov v1.b[3], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96] -; CHECK-GI-DOT-NEXT: mov v0.b[9], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] -; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[4], w4 +; CHECK-GI-DOT-NEXT: mov v1.b[4], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104] -; CHECK-GI-DOT-NEXT: mov v0.b[10], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] -; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[5], w5 +; CHECK-GI-DOT-NEXT: mov v1.b[5], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112] -; CHECK-GI-DOT-NEXT: mov v0.b[11], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] -; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[6], w6 +; CHECK-GI-DOT-NEXT: mov v1.b[6], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120] -; CHECK-GI-DOT-NEXT: mov v0.b[12], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: mov v0.b[7], w7 +; CHECK-GI-DOT-NEXT: mov v1.b[7], w9 +; CHECK-GI-DOT-NEXT: mov v0.b[8], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] +; CHECK-GI-DOT-NEXT: fmov d1, d1 +; CHECK-GI-DOT-NEXT: mov v0.b[9], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] 
+; CHECK-GI-DOT-NEXT: udot v4.4s, v1.16b, v2.16b +; CHECK-GI-DOT-NEXT: mov v0.b[10], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] +; CHECK-GI-DOT-NEXT: mov v0.b[11], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] +; CHECK-GI-DOT-NEXT: mov v0.b[12], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] +; CHECK-GI-DOT-NEXT: mov v0.b[13], w8 ; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48] -; CHECK-GI-DOT-NEXT: fmov s5, w9 -; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w8 +; CHECK-GI-DOT-NEXT: mov v0.b[14], w8 ; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56] -; CHECK-GI-DOT-NEXT: mov v0.b[13], v2.b[0] -; CHECK-GI-DOT-NEXT: movi v2.8b, #1 -; CHECK-GI-DOT-NEXT: mov v1.b[7], v5.b[0] -; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w8 -; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0] -; CHECK-GI-DOT-NEXT: fmov d1, d1 -; CHECK-GI-DOT-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v4.16b -; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b -; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v3.4s +; CHECK-GI-DOT-NEXT: mov v0.b[15], w8 +; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v3.16b +; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s ; CHECK-GI-DOT-NEXT: addv s0, v0.4s ; CHECK-GI-DOT-NEXT: fmov w0, s0 ; CHECK-GI-DOT-NEXT: ret @@ -4484,71 +4396,49 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext: ; CHECK-GI-BASE: // %bb.0: // %entry ; CHECK-GI-BASE-NEXT: fmov s0, w0 -; CHECK-GI-BASE-NEXT: fmov s1, w1 ; CHECK-GI-BASE-NEXT: ldr w8, [sp] -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #64] +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #8] ; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72] -; CHECK-GI-BASE-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w2 -; CHECK-GI-BASE-NEXT: fmov s2, w10 -; CHECK-GI-BASE-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w3 -; CHECK-GI-BASE-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w4 -; CHECK-GI-BASE-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w5 -; CHECK-GI-BASE-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w6 -; CHECK-GI-BASE-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w7 -; CHECK-GI-BASE-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-BASE-NEXT: mov v0.b[1], w1 +; CHECK-GI-BASE-NEXT: mov v0.b[2], w2 +; CHECK-GI-BASE-NEXT: mov v0.b[3], w3 +; CHECK-GI-BASE-NEXT: mov v0.b[4], w4 +; CHECK-GI-BASE-NEXT: mov v0.b[5], w5 +; CHECK-GI-BASE-NEXT: mov v0.b[6], w6 +; CHECK-GI-BASE-NEXT: mov v0.b[7], w7 +; CHECK-GI-BASE-NEXT: mov v0.b[8], w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #64] ; CHECK-GI-BASE-NEXT: fmov s1, w8 -; CHECK-GI-BASE-NEXT: ldr w8, [sp, #8] -; CHECK-GI-BASE-NEXT: fmov s3, w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16] -; CHECK-GI-BASE-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w9 +; CHECK-GI-BASE-NEXT: mov v0.b[9], w9 ; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80] -; CHECK-GI-BASE-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] -; CHECK-GI-BASE-NEXT: mov v0.b[9], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[1], w10 +; CHECK-GI-BASE-NEXT: mov v0.b[10], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24] -; CHECK-GI-BASE-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] -; CHECK-GI-BASE-NEXT: mov v0.b[10], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[2], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] +; 
CHECK-GI-BASE-NEXT: mov v0.b[11], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32] -; CHECK-GI-BASE-NEXT: mov v1.b[3], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] -; CHECK-GI-BASE-NEXT: mov v0.b[11], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[3], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] +; CHECK-GI-BASE-NEXT: mov v0.b[12], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40] -; CHECK-GI-BASE-NEXT: mov v1.b[4], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] -; CHECK-GI-BASE-NEXT: mov v0.b[12], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[4], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] +; CHECK-GI-BASE-NEXT: mov v0.b[13], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48] -; CHECK-GI-BASE-NEXT: mov v1.b[5], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120] -; CHECK-GI-BASE-NEXT: mov v0.b[13], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[5], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] +; CHECK-GI-BASE-NEXT: mov v0.b[14], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56] -; CHECK-GI-BASE-NEXT: mov v1.b[6], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 -; CHECK-GI-BASE-NEXT: mov v1.b[7], v2.b[0] -; CHECK-GI-BASE-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b +; CHECK-GI-BASE-NEXT: mov v1.b[6], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120] +; CHECK-GI-BASE-NEXT: mov v0.b[15], w8 +; CHECK-GI-BASE-NEXT: mov v1.b[7], w9 ; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b -; CHECK-GI-BASE-NEXT: fmov w9, s1 +; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b ; CHECK-GI-BASE-NEXT: fmov w8, s0 +; CHECK-GI-BASE-NEXT: fmov w9, s1 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: sxth w0, w8 ; CHECK-GI-BASE-NEXT: ret @@ -4556,76 +4446,54 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext: ; CHECK-GI-DOT: // %bb.0: // %entry ; CHECK-GI-DOT-NEXT: fmov s0, w0 -; CHECK-GI-DOT-NEXT: fmov s1, w1 -; CHECK-GI-DOT-NEXT: ldr w8, [sp] ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64] +; CHECK-GI-DOT-NEXT: ldr w8, [sp] ; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72] -; CHECK-GI-DOT-NEXT: movi v4.8b, #1 -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] -; CHECK-GI-DOT-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w2 -; CHECK-GI-DOT-NEXT: fmov s3, w10 -; CHECK-GI-DOT-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w3 -; CHECK-GI-DOT-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w4 -; CHECK-GI-DOT-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w5 -; CHECK-GI-DOT-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w6 -; CHECK-GI-DOT-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w7 -; CHECK-GI-DOT-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-DOT-NEXT: movi v2.8b, #1 +; CHECK-GI-DOT-NEXT: movi v3.8b, #1 ; CHECK-GI-DOT-NEXT: fmov s1, w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80] -; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: mov v0.b[1], w1 +; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: mov v1.b[1], w10 +; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0] +; CHECK-GI-DOT-NEXT: mov v0.b[2], w2 +; CHECK-GI-DOT-NEXT: mov v1.b[2], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88] -; CHECK-GI-DOT-NEXT: mov v0.b[8], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; 
CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] -; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[3], w3 +; CHECK-GI-DOT-NEXT: mov v1.b[3], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96] -; CHECK-GI-DOT-NEXT: mov v0.b[9], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] -; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[4], w4 +; CHECK-GI-DOT-NEXT: mov v1.b[4], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104] -; CHECK-GI-DOT-NEXT: mov v0.b[10], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] -; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[5], w5 +; CHECK-GI-DOT-NEXT: mov v1.b[5], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112] -; CHECK-GI-DOT-NEXT: mov v0.b[11], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] -; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[6], w6 +; CHECK-GI-DOT-NEXT: mov v1.b[6], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120] -; CHECK-GI-DOT-NEXT: mov v0.b[12], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: mov v0.b[7], w7 +; CHECK-GI-DOT-NEXT: mov v1.b[7], w9 +; CHECK-GI-DOT-NEXT: mov v0.b[8], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] +; CHECK-GI-DOT-NEXT: fmov d1, d1 +; CHECK-GI-DOT-NEXT: mov v0.b[9], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] +; CHECK-GI-DOT-NEXT: sdot v4.4s, v1.16b, v2.16b +; CHECK-GI-DOT-NEXT: mov v0.b[10], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] +; CHECK-GI-DOT-NEXT: mov v0.b[11], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] +; CHECK-GI-DOT-NEXT: mov v0.b[12], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] +; CHECK-GI-DOT-NEXT: mov v0.b[13], w8 ; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48] -; CHECK-GI-DOT-NEXT: fmov s5, w9 -; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w8 +; CHECK-GI-DOT-NEXT: mov v0.b[14], w8 ; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56] -; CHECK-GI-DOT-NEXT: mov v0.b[13], v2.b[0] -; CHECK-GI-DOT-NEXT: movi v2.8b, #1 -; CHECK-GI-DOT-NEXT: mov v1.b[7], v5.b[0] -; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w8 -; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0] -; CHECK-GI-DOT-NEXT: fmov d1, d1 -; CHECK-GI-DOT-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v4.16b -; CHECK-GI-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b -; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v3.4s +; CHECK-GI-DOT-NEXT: mov v0.b[15], w8 +; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v3.16b +; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s ; CHECK-GI-DOT-NEXT: addv s0, v0.4s ; CHECK-GI-DOT-NEXT: fmov w0, s0 ; CHECK-GI-DOT-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll index e536ba240453e..ead790203f949 100644 --- a/llvm/test/CodeGen/AArch64/xtn.ll +++ b/llvm/test/CodeGen/AArch64/xtn.ll @@ -127,12 +127,19 @@ entry: } define <2 x i8> @xtn_v2i128_v2i8(<2 x i128> %a) { -; CHECK-LABEL: xtn_v2i128_v2i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: mov v0.s[1], w2 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xtn_v2i128_v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov v0.s[1], w2 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: xtn_v2i128_v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: mov v0.s[1], w2 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %arg1 = trunc <2 x i128> %a to <2 x i8> ret <2 x i8> %arg1 @@ -168,8 +175,7 @@ define <2 x i16> @xtn_v2i128_v2i16(<2 x i128> %a) { ; CHECK-GI-LABEL: xtn_v2i128_v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w2 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -189,23 +195,36 @@ entry: } define <2 x i32> @xtn_v2i128_v2i32(<2 x i128> %a) { -; CHECK-LABEL: xtn_v2i128_v2i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: mov v0.s[1], w2 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xtn_v2i128_v2i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov v0.s[1], w2 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xtn_v2i128_v2i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: mov v0.s[1], w2 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %arg1 = trunc <2 x i128> %a to <2 x i32> ret <2 x i32> %arg1 } define <2 x i64> @xtn_v2i128_v2i64(<2 x i128> %a) { -; CHECK-LABEL: xtn_v2i128_v2i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: mov v0.d[1], x2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xtn_v2i128_v2i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d0, x0 +; CHECK-SD-NEXT: mov v0.d[1], x2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xtn_v2i128_v2i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.d[0], x0 +; CHECK-GI-NEXT: mov v0.d[1], x2 +; CHECK-GI-NEXT: ret entry: %arg1 = trunc <2 x i128> %a to <2 x i64> ret <2 x i64> %arg1 @@ -282,10 +301,10 @@ define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) { ; ; CHECK-GI-LABEL: xtn_v3i32_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -309,11 +328,9 @@ define <3 x i16> @xtn_v3i64_v3i16(<3 x i64> %a) { ; CHECK-GI-NEXT: fmov x8, d0 ; CHECK-GI-NEXT: fmov x9, d1 ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: fmov x8, d2 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -334,10 +351,10 @@ define <3 x i32> @xtn_v3i64_v3i32(<3 x i64> %a) { ; CHECK-GI-LABEL: xtn_v3i64_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: fmov x8, d2 -; CHECK-GI-NEXT: mov v0.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index 
bb968c8eb00fc..7e95b6684e821 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -245,11 +245,9 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) { ; CHECK-GI-NEXT: and w8, w0, #0xff ; CHECK-GI-NEXT: and w9, w1, #0xff ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: and w8, w2, #0xff -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -271,10 +269,10 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) { ; CHECK-GI-LABEL: zext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0xff -; CHECK-GI-NEXT: and w9, w1, #0xff -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: and w8, w1, #0xff +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: and w8, w2, #0xff -; CHECK-GI-NEXT: mov v0.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -328,7 +326,7 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) { ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] -; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: umov w8, v0.h[2] ; CHECK-GI-NEXT: mov v1.s[1], w9 ; CHECK-GI-NEXT: mov v1.s[2], w8 @@ -406,11 +404,9 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) { ; CHECK-GI-NEXT: and w8, w0, #0x3ff ; CHECK-GI-NEXT: and w9, w1, #0x3ff ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: and w8, w2, #0x3ff -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -432,10 +428,10 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) { ; CHECK-GI-LABEL: zext_v3i10_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0x3ff -; CHECK-GI-NEXT: and w9, w1, #0x3ff -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: and w8, w1, #0x3ff +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: and w8, w2, #0x3ff -; CHECK-GI-NEXT: mov v0.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -1089,51 +1085,39 @@ define <16 x i32> @zext_v16i10_v16i32(<16 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v16i10_v16i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: fmov s3, w5 -; CHECK-GI-NEXT: ldr w9, [sp, #8] -; CHECK-GI-NEXT: ldr w10, [sp, #32] +; CHECK-GI-NEXT: ldr w9, [sp, #32] +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: ldr w10, [sp, #8] ; CHECK-GI-NEXT: ldr w11, [sp, #40] ; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s3, w9 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: fmov s5, w10 -; CHECK-GI-NEXT: fmov s6, w11 +; CHECK-GI-NEXT: mov v0.h[1], w1 ; CHECK-GI-NEXT: ldr w9, [sp, #48] -; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w2 -; CHECK-GI-NEXT: mov v2.h[1], v4.h[0] -; CHECK-GI-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: fmov s6, w9 +; CHECK-GI-NEXT: movi v4.4s, #3, msl #8 +; CHECK-GI-NEXT: mov v1.h[1], w5 +; CHECK-GI-NEXT: 
mov v2.h[1], w10 +; CHECK-GI-NEXT: mov v3.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w6 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mov v3.h[2], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #24] ; CHECK-GI-NEXT: ldr w9, [sp, #56] -; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w6 -; CHECK-GI-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: mov v5.h[2], v6.h[0] -; CHECK-GI-NEXT: fmov s6, w9 -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w3 -; CHECK-GI-NEXT: mov v2.h[3], v4.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w7 -; CHECK-GI-NEXT: mov v5.h[3], v6.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] -; CHECK-GI-NEXT: movi v3.4s, #3, msl #8 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w7 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v4.4s, v5.4h, #0 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b -; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b -; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-GI-NEXT: and v3.16b, v4.16b, v3.16b +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b +; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b ; CHECK-GI-NEXT: ret entry: %c = zext <16 x i10> %a to <16 x i32> @@ -1185,62 +1169,50 @@ define <16 x i64> @zext_v16i10_v16i64(<16 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v16i10_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: ldr w10, [sp, #32] -; CHECK-GI-NEXT: ldr w11, [sp, #40] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s0, w0 ; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w9, [sp, #48] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr w9, [sp, #8] +; CHECK-GI-NEXT: ldr w11, [sp, #40] ; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s3, w10 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v1.h[1], w5 +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: mov v3.h[1], w11 +; CHECK-GI-NEXT: ldr w9, [sp, #48] +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w6 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mov v3.h[2], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w10 -; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w2 -; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] -; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w9, [sp, #56] +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w7 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 ; CHECK-GI-NEXT: adrp x8, .LCPI54_0 ; CHECK-GI-NEXT: ldr q7, [x8, :lo12:.LCPI54_0] -; CHECK-GI-NEXT: mov v0.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w6 -; CHECK-GI-NEXT: mov v2.h[3], v5.h[0] -; CHECK-GI-NEXT: mov v1.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #56] -; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w3 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; 
CHECK-GI-NEXT: fmov s4, w7 -; CHECK-GI-NEXT: ushll v17.2d, v2.2s, #0 -; CHECK-GI-NEXT: ushll2 v18.2d, v2.4s, #0 -; CHECK-GI-NEXT: mov v1.h[3], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w9 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov v3.h[3], v4.h[0] ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll2 v5.2d, v0.4s, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll v6.2d, v1.2s, #0 ; CHECK-GI-NEXT: ushll2 v16.2d, v1.4s, #0 -; CHECK-GI-NEXT: and v0.16b, v4.16b, v7.16b -; CHECK-GI-NEXT: and v1.16b, v5.16b, v7.16b -; CHECK-GI-NEXT: and v4.16b, v17.16b, v7.16b -; CHECK-GI-NEXT: and v5.16b, v18.16b, v7.16b +; CHECK-GI-NEXT: ushll v17.2d, v2.2s, #0 +; CHECK-GI-NEXT: ushll2 v18.2d, v2.4s, #0 ; CHECK-GI-NEXT: ushll v19.2d, v3.2s, #0 ; CHECK-GI-NEXT: ushll2 v20.2d, v3.4s, #0 +; CHECK-GI-NEXT: and v0.16b, v4.16b, v7.16b +; CHECK-GI-NEXT: and v1.16b, v5.16b, v7.16b ; CHECK-GI-NEXT: and v2.16b, v6.16b, v7.16b ; CHECK-GI-NEXT: and v3.16b, v16.16b, v7.16b +; CHECK-GI-NEXT: and v4.16b, v17.16b, v7.16b +; CHECK-GI-NEXT: and v5.16b, v18.16b, v7.16b ; CHECK-GI-NEXT: and v6.16b, v19.16b, v7.16b ; CHECK-GI-NEXT: and v7.16b, v20.16b, v7.16b ; CHECK-GI-NEXT: ret From 12c0823d67a8d5a61d6430aac609ef5e468267a6 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 3 Sep 2024 19:06:40 -0700 Subject: [PATCH 014/425] [clang-format] Handle spaces in file paths in git-clang-format.bat (#107041) This patch is provided by @jeliebig. Fixes #107017. --- clang/tools/clang-format/git-clang-format.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/tools/clang-format/git-clang-format.bat b/clang/tools/clang-format/git-clang-format.bat index 9965cd4312fe3..19c82d8a04132 100644 --- a/clang/tools/clang-format/git-clang-format.bat +++ b/clang/tools/clang-format/git-clang-format.bat @@ -1 +1 @@ -py -3 %~pn0 %* +py -3 "%~pn0" %* From a27ff17034d66d852ba83be7d237d6a623cb4ff4 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 3 Sep 2024 19:07:14 -0700 Subject: [PATCH 015/425] [clang-format] Fix a regression in annotating ObjCBlockLParen (#107021) Fixes #106994. --- clang/lib/Format/TokenAnnotator.cpp | 3 +-- clang/unittests/Format/TokenAnnotatorTest.cpp | 7 +++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 64e936f627d43..bf37062393719 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3588,8 +3588,7 @@ static FormatToken *getFunctionName(const AnnotatedLine &Line, // Make sure the name is followed by a pair of parentheses. 
if (Name) { - if (Tok->is(tok::l_paren) && Tok->isNot(TT_FunctionTypeLParen) && - Tok->MatchingParen) { + if (Tok->is(tok::l_paren) && Tok->is(TT_Unknown) && Tok->MatchingParen) { OpeningParen = Tok; return Name; } diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 5d37a65250d0b..b8a86245808e5 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1678,6 +1678,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsObjCBlock) { "}();"); ASSERT_EQ(Tokens.size(), 19u) << Tokens; EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_ObjCBlockLBrace); + + Tokens = annotate("id (^block)(Foo *a) = ^id _Nullable(Foo *_Nullable a) {\n" + " return a;\n" + "};"); + ASSERT_EQ(Tokens.size(), 27u) << Tokens; + EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); // Not CtorDtorDeclName. + EXPECT_TOKEN(Tokens[1], tok::l_paren, TT_ObjCBlockLParen); } TEST_F(TokenAnnotatorTest, UnderstandsObjCMethodExpr) { From b55186eefd73b3848e01c8471c47a9354969d652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Wed, 4 Sep 2024 04:07:35 +0200 Subject: [PATCH 016/425] [clang][Driver] Define soft float macros for PPC. (#106012) Fixes #105972. Co-authored-by: Qiu Chaofan --- clang/lib/Basic/Targets/PPC.cpp | 19 ++++++++++++++----- clang/lib/Basic/Targets/PPC.h | 2 ++ clang/test/Preprocessor/init-ppc.c | 18 ++++++++++++++++++ clang/test/Preprocessor/init-ppc64.c | 18 ++++++++++++++++++ 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index 04dc436eb1b9c..1448069173b5f 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -68,6 +68,10 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector &Features, HasSPE = true; LongDoubleWidth = LongDoubleAlign = 64; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); + } else if (Feature == "+frsqrte") { + HasFrsqrte = true; + } else if (Feature == "+frsqrtes") { + HasFrsqrtes = true; } else if (Feature == "-hard-float") { FloatABI = SoftFloat; } else if (Feature == "+paired-vector-memops") { @@ -402,9 +406,18 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__VEC__", "10206"); Builder.defineMacro("__ALTIVEC__"); } - if (HasSPE) { + if (HasSPE) Builder.defineMacro("__SPE__"); + if (HasSPE || FloatABI == SoftFloat) Builder.defineMacro("__NO_FPRS__"); + if (FloatABI == SoftFloat) { + Builder.defineMacro("_SOFT_FLOAT"); + Builder.defineMacro("_SOFT_DOUBLE"); + } else { + if (HasFrsqrte) + Builder.defineMacro("__RSQRTE__"); + if (HasFrsqrtes) + Builder.defineMacro("__RSQRTEF__"); } if (HasVSX) Builder.defineMacro("__VSX__"); @@ -439,14 +452,10 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, // FIXME: The following are not yet generated here by Clang, but are // generated by GCC: // - // _SOFT_FLOAT_ // __RECIP_PRECISION__ // __APPLE_ALTIVEC__ // __RECIP__ // __RECIPF__ - // __RSQRTE__ - // __RSQRTEF__ - // _SOFT_DOUBLE_ // __NO_LWSYNC__ // __CMODEL_MEDIUM__ // __CMODEL_LARGE__ diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 6d5d8dd54d013..b0833d30550af 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -73,6 +73,8 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { bool HasExtDiv = false; bool HasP9Vector = false; bool HasSPE = false; + bool HasFrsqrte = false; + bool HasFrsqrtes = false; bool PairedVectorMemops = false; 
bool HasP10Vector = false; bool HasPCRelativeMemops = false; diff --git a/clang/test/Preprocessor/init-ppc.c b/clang/test/Preprocessor/init-ppc.c index 3fb642af9d742..1421b102a3dfd 100644 --- a/clang/test/Preprocessor/init-ppc.c +++ b/clang/test/Preprocessor/init-ppc.c @@ -977,3 +977,21 @@ // RUN: %clang_cc1 -E -dM -triple=powerpc-unknown-openbsd -x c++ < /dev/null | FileCheck -match-full-lines -check-prefix PPC-OPENBSD-CXX %s // PPC-OPENBSD-CXX: #define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL + +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR4-RSQRT %s +// +// PPCPWR4-RSQRT-NOT:#define __RSQRTEF__ 1 +// PPCPWR4-RSQRT-NOT:#define __RSQRTE__ 1 +// +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none -target-feature +frsqrte -target-feature +frsqrtes < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR5-RSQRT %s +// +// PPCPWR5-RSQRT:#define __RSQRTEF__ 1 +// PPCPWR5-RSQRT:#define __RSQRTE__ 1 + +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-linux-gnu -target-feature -hard-float < /dev/null | FileCheck -match-full-lines -check-prefix PPC-SOFTFLT %s +// +// PPC-SOFTFLT:#define _SOFT_DOUBLE 1 +// PPC-SOFTFLT:#define _SOFT_FLOAT 1 +// PPC-SOFTFLT:#define __NO_FPRS__ 1 +// PPC-SOFTFLT-NOT:#define __RSQRTE__ 1 +// PPC-SOFTFLT-NOT:#define __RSQRTEF__ 1 diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c index 56164beb913d5..57e2ca31d5d53 100644 --- a/clang/test/Preprocessor/init-ppc64.c +++ b/clang/test/Preprocessor/init-ppc64.c @@ -1110,3 +1110,21 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-freebsd < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-FREEBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-freebsd < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-FREEBSD %s // PPC64-FREEBSD-NOT: #define __LONG_DOUBLE_128__ 1 + +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix PPC64PWR4-RSQRT %s +// +// PPC64PWR4-RSQRT-NOT:#define __RSQRTEF__ 1 +// PPC64PWR4-RSQRT-NOT:#define __RSQRTE__ 1 +// +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-feature +frsqrte -target-feature +frsqrtes < /dev/null | FileCheck -match-full-lines -check-prefix PPC64PWR5-RSQRT %s +// +// PPC64PWR5-RSQRT:#define __RSQRTEF__ 1 +// PPC64PWR5-RSQRT:#define __RSQRTE__ 1 + +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-feature -hard-float -xc /dev/null | FileCheck --check-prefix=PPC64-SOFTFLT %s +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu -target-feature -hard-float -xc /dev/null | FileCheck --check-prefix=PPC64-SOFTFLT %s +// PPC64-SOFTFLT:#define _SOFT_DOUBLE 1 +// PPC64-SOFTFLT:#define _SOFT_FLOAT 1 +// PPC64-SOFTFLT:#define __NO_FPRS__ 1 +// PPC64-SOFTFLT-NOT:#define __RSQRTE__ 1 +// PPC64-SOFTFLT-NOT:#define __RSQRTEF__ 1 From 8d0816615f920b0783bafa903804b9e2a2fa4e91 Mon Sep 17 00:00:00 2001 From: yifeizh2 Date: Wed, 4 Sep 2024 10:10:43 +0800 Subject: [PATCH 017/425] [MLIR][Tensor] Fix source/dest type check in UnPackOp canonicalize (#106094) Fix `RankedTensorType` equality check in unpack op canonicalization. 
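
As an illustration only (not part of the change itself), here is a minimal
MLIR sketch of the case the corrected check guards against, modeled on the
regression test added below; the function and value names are placeholders:

  // Folding this unpack(pack(x)) pair back to %t would be wrong: %t has
  // type tensor<256x512xbf16> while the unpack result has type
  // tensor<224x512xbf16>, so the fold must bail out whenever the pack
  // source type differs from the unpack destination type.
  func.func @unpack_of_pack_no_fold(%t: tensor<256x512xbf16>) -> tensor<224x512xbf16> {
    %e0 = tensor.empty() : tensor<4x16x64x32xbf16>
    %e1 = tensor.empty() : tensor<224x512xbf16>
    %p = tensor.pack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %e0 : tensor<256x512xbf16> -> tensor<4x16x64x32xbf16>
    %u = tensor.unpack %p inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %e1 : tensor<4x16x64x32xbf16> -> tensor<224x512xbf16>
    return %u : tensor<224x512xbf16>
  }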
--- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 6 +++--- mlir/test/Dialect/Tensor/canonicalize.mlir | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index e11c6aaccf74d..996de530c255d 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -4203,7 +4203,7 @@ static bool inferStaticShape(PackOp packOp, SmallVectorImpl &srcShape, } LogicalResult PackOp::canonicalize(PackOp packOp, PatternRewriter &rewriter) { - // Fold an unpack(pack(x)) to x. + // Fold an pack(unpack(x)) to x. if (auto unPackOp = packOp.getSource().getDefiningOp()) { if (unPackOp.getSourceType() != packOp.getDestType()) return failure(); @@ -4437,9 +4437,9 @@ static bool inferStaticShape(UnPackOp op, SmallVectorImpl &srcShape, LogicalResult UnPackOp::canonicalize(UnPackOp unPackOp, PatternRewriter &rewriter) { - /// pack(unpack(x)) -> x + /// unpack(pack(x)) -> x if (PackOp packOp = unPackOp.getSource().getDefiningOp()) { - if (packOp.getDestType() != unPackOp.getSourceType()) + if (packOp.getSourceType() != unPackOp.getDestType()) return failure(); if (packOp.getPaddingValue() || !hasSameInnerOuterAttribute(packOp, unPackOp) || diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 458ff51be7462..735790e5bd6c5 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -2268,6 +2268,19 @@ func.func @unpack_pack(%t: tensor<128x128xf32>, %tile1: index, %tile2: index) -> // ----- +// CHECK: func.func @unpack_pack_with_padding_no_canonicalization( +// CHECK: tensor.pack +// CHECK: tensor.unpack +func.func @unpack_pack_with_padding_no_canonicalization(%t: tensor<256x512xbf16>) -> tensor<224x512xbf16> { + %tensor_empty = tensor.empty() : tensor<4x16x64x32xbf16> + %tensor_empty1 = tensor.empty() : tensor<224x512xbf16> + %packed = tensor.pack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty : tensor<256x512xbf16> -> tensor<4x16x64x32xbf16> + %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty1 : tensor<4x16x64x32xbf16> -> tensor<224x512xbf16> + return %unpacked : tensor<224x512xbf16> +} + +// ----- + // Chain NCnc -> NC -> NC -> NCnc // CHECK: func.func @pack_unpack( // CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, From 812c96e8b9354e5e84d513f5b03172db5ad3b491 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 3 Sep 2024 19:11:12 -0700 Subject: [PATCH 018/425] [clang-format] Handle pointer/reference in macro definitions (#107074) A macro definition needs its own scope stack in the annotator, so we add the MacroBodyScopes stack and use ScopeStack to refer to it when in the macro definition body. Also, we need to have a scope type for a child block because its parent line is parsed (and thus the scope type for the braces is popped off the scope stack) before the lines in the child block are. Fixes #99271. 
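
As an illustration only (not part of the change), here is a small C++ sketch
of the inputs this affects, based on the annotator unit tests added below;
the identifiers are placeholders:

  // In the macro body, `a * b` is an expression, so the `*` should be
  // annotated as a binary operator rather than as a pointer declarator.
  #define FOO \
    void foo() { f(a * b); }

  // In a declaration inside a macro body, `*` must still be annotated as a
  // pointer/reference, even when the definition sits in another scope.
  namespace {
  #define BAR(x) void bar(a##x *b);
  }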
--- clang/lib/Format/TokenAnnotator.cpp | 24 +++++++++++++------ clang/lib/Format/TokenAnnotator.h | 8 +++---- clang/unittests/Format/TokenAnnotatorTest.cpp | 15 ++++++++++++ 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index bf37062393719..6a1cf61659fd9 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -137,9 +137,8 @@ class AnnotatingParser { private: ScopeType getScopeType(const FormatToken &Token) const { switch (Token.getType()) { - case TT_FunctionLBrace: case TT_LambdaLBrace: - return ST_Function; + return ST_ChildBlock; case TT_ClassLBrace: case TT_StructLBrace: case TT_UnionLBrace: @@ -400,7 +399,8 @@ class AnnotatingParser { OpeningParen.Previous->MatchingParen->isOneOf( TT_ObjCBlockLParen, TT_FunctionTypeLParen)) { Contexts.back().IsExpression = false; - } else if (!Line.MustBeDeclaration && !Line.InPPDirective) { + } else if (!Line.MustBeDeclaration && + (!Line.InPPDirective || (Line.InMacroBody && !Scopes.empty()))) { bool IsForOrCatch = OpeningParen.Previous && OpeningParen.Previous->isOneOf(tok::kw_for, tok::kw_catch); @@ -3649,11 +3649,21 @@ static bool isCtorOrDtorName(const FormatToken *Tok) { } void TokenAnnotator::annotate(AnnotatedLine &Line) { - AnnotatingParser Parser(Style, Line, Keywords, Scopes); + if (!Line.InMacroBody) + MacroBodyScopes.clear(); + + auto &ScopeStack = Line.InMacroBody ? MacroBodyScopes : Scopes; + AnnotatingParser Parser(Style, Line, Keywords, ScopeStack); Line.Type = Parser.parseLine(); - for (auto &Child : Line.Children) - annotate(*Child); + if (!Line.Children.empty()) { + ScopeStack.push_back(ST_ChildBlock); + for (auto &Child : Line.Children) + annotate(*Child); + // ScopeStack can become empty if Child has an unmatched `}`. + if (!ScopeStack.empty()) + ScopeStack.pop_back(); + } // With very deep nesting, ExpressionParser uses lots of stack and the // formatting algorithm is very slow. We're not going to do a good job here @@ -3671,7 +3681,7 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) { if (IsCpp) { FormatToken *OpeningParen = nullptr; auto *Tok = getFunctionName(Line, OpeningParen); - if (Tok && ((!Scopes.empty() && Scopes.back() == ST_Class) || + if (Tok && ((!ScopeStack.empty() && ScopeStack.back() == ST_Class) || Line.endsWith(TT_FunctionLBrace) || isCtorOrDtorName(Tok))) { Tok->setFinalizedType(TT_CtorDtorDeclName); assert(OpeningParen); diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h index f4f2bba0eb217..5a02030e5ba7f 100644 --- a/clang/lib/Format/TokenAnnotator.h +++ b/clang/lib/Format/TokenAnnotator.h @@ -36,11 +36,11 @@ enum LineType { }; enum ScopeType { + // Contained in child block. + ST_ChildBlock, // Contained in class declaration/definition. ST_Class, - // Contained within function definition. - ST_Function, - // Contained within other scope block (loop, if/else, etc). + // Contained within other scope block (function, loop, if/else, etc). 
ST_Other, }; @@ -269,7 +269,7 @@ class TokenAnnotator { const AdditionalKeywords &Keywords; - SmallVector Scopes; + SmallVector Scopes, MacroBodyScopes; }; } // end namespace format diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index b8a86245808e5..a2986f589396b 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -327,6 +327,21 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { ASSERT_EQ(Tokens.size(), 26u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::ampamp, TT_BinaryOperator); EXPECT_TOKEN(Tokens[16], tok::ampamp, TT_BinaryOperator); + + Tokens = annotate("#define FOO \\\n" + " void foo() { f(a * b); }"); + ASSERT_EQ(Tokens.size(), 17u) << Tokens; + EXPECT_TOKEN(Tokens[11], tok::star, TT_BinaryOperator); + + Tokens = annotate("#define FOO auto Foo = [] { f(a * b); };"); + ASSERT_EQ(Tokens.size(), 19u) << Tokens; + EXPECT_TOKEN(Tokens[12], tok::star, TT_BinaryOperator); + + Tokens = annotate("namespace {\n" + "#define FOO(x) void foo(a##x *b);\n" + "}"); + ASSERT_EQ(Tokens.size(), 20u) << Tokens; + EXPECT_TOKEN(Tokens[14], tok::star, TT_PointerOrReference); } TEST_F(TokenAnnotatorTest, UnderstandsUsesOfPlusAndMinus) { From f4b9839d6f7c9ec2967a42f2d5546a2a2ae77ca4 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 4 Sep 2024 10:21:27 +0800 Subject: [PATCH 019/425] [mlir][TensorToSPIRV] Add type check for `tensor.extract` in TensorToSPIRV (#107110) This patch add a type check for `tensor.extract` in TensorToSPIRV. Only convert `tensor.extract` with supported element type. Fix #74466. --- .../Conversion/TensorToSPIRV/TensorToSPIRV.cpp | 2 ++ .../TensorToSPIRV/tensor-ops-to-spirv.mlir | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRV.cpp b/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRV.cpp index 0fb58623bdafb..468fffdd2df91 100644 --- a/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRV.cpp +++ b/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRV.cpp @@ -45,6 +45,8 @@ class TensorExtractPattern final ConversionPatternRewriter &rewriter) const override { auto tensorType = cast(extractOp.getTensor().getType()); + if (!isa(tensorType.getElementType())) + return rewriter.notifyMatchFailure(extractOp, "unsupported type"); if (!tensorType.hasStaticShape()) return rewriter.notifyMatchFailure(extractOp, "non-static tensor"); diff --git a/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir b/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir index 32d0fbea65b16..b69c2d0408d17 100644 --- a/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir +++ b/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir @@ -29,6 +29,24 @@ func.func @tensor_extract_constant(%a : index, %b: index, %c: index) -> i32 { // ----- +// CHECK-LABEL: test_spirv_unsupported_type_index +func.func @test_spirv_unsupported_type_index(%a : index) { + %cst = arith.constant dense<[1, 2]> : tensor<2xindex> + // CHECK: tensor.extract + %extract = tensor.extract %cst[%a] : tensor<2xindex> + return +} + +// CHECK-LABEL: test_spirv_unsupported_type_i128 +func.func @test_spirv_unsupported_type_i128(%a : index) { + %cst = arith.constant dense<[1, 2]> : tensor<2xi128> + // CHECK: tensor.extract + %extract = tensor.extract %cst[%a] : tensor<2xi128> + return +} + +// ----- + //===----------------------------------------------------------------------===// // Type conversion 
//===----------------------------------------------------------------------===// From 37263b6c6741894ffbc0f61979c5c85db515ef2d Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 4 Sep 2024 10:24:17 +0800 Subject: [PATCH 020/425] [mlir][tosa] Add verifier for `tosa.pad` (#106351) This patch adds a verifier to `tosa.pad`, which fixes a crash. `tosa.pad` expects: - same input and output tensor rank. - 'padding' tensor rank equal to 2. Fix #106168. --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 1 + mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 14 +++++++++++ mlir/test/Dialect/Tosa/invalid.mlir | 25 ++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 0be0f8ef2d7a0..1a132e73be864 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -1594,6 +1594,7 @@ def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad"> { let hasCanonicalizer = 1; let hasFolder = 1; + let hasVerifier = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 267a875710ed7..d93db1b237f31 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -817,6 +817,20 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents( return success(); } +LogicalResult tosa::PadOp::verify() { + RankedTensorType inputType = getInput1().getType(); + RankedTensorType outputType = getOutput().getType(); + TensorType paddingType = getPadding().getType(); + + if (inputType.getRank() != outputType.getRank()) + return emitOpError() << "expect same input and output tensor rank."; + + if (paddingType.hasRank() && paddingType.getRank() != 2) + return emitOpError() << "expect 'padding' tensor rank equal to 2."; + + return success(); +} + static SmallVector convertToMlirShape(ArrayRef shape) { return to_vector(llvm::map_range(shape, [](int64_t dim) { return dim == -1 ?
ShapedType::kDynamic : dim; diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index e72e154f95277..418f7687b3cce 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -72,6 +72,31 @@ func.func @test_pad_non_const(%arg0: tensor<13x21x3xi8>, %arg1: tensor) -> t // ----- +func.func @test_pad_io_rank_mismatch(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) { + // expected-error@+1 {{'tosa.pad' op expect same input and output tensor rank.}} + %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2x2xi32>) -> tensor<13x21x3xf32> + return +} + +// ----- + +func.func @test_pad_invalid_padding_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2xi32>) { + // expected-error@+1 {{'tosa.pad' op expect 'padding' tensor rank equal to 2.}} + %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2xi32>) -> tensor<13x21xf32> + return +} + +// ----- + +func.func @test_pad_invalid_padConst_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) { + %0 = "tosa.const"() {value = dense<3.14> : tensor<1xf32>} : () -> tensor<1xf32> + // expected-error@+1 {{'tosa.pad' op operand #2 must be 0D tensor of number values, but got 'tensor<1xf32>'}} + %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21xf32>, tensor<2x2xi32>, tensor<1xf32>) -> tensor<13x21xf32> + return +} + +// ----- + func.func @test_transpose_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21xf32> { // expected-error@+1 {{'tosa.transpose' op perms of transpose is not constant}} %0 = tosa.transpose %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<3x13x21xf32> From a628bc3c2e7314e4b7c9af0d10cf39a70c731d15 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 3 Sep 2024 19:55:08 -0700 Subject: [PATCH 021/425] [AArch64] Fix a warning This patch fixes: lib/Target/AArch64/AArch64GenPostLegalizeGILowering.inc:506:14: error: unused variable 'GIMatchData_matchinfo' [-Werror,-Wunused-variable] --- llvm/lib/Target/AArch64/AArch64Combine.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index f99d1e276c60f..25989fb598895 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -223,7 +223,7 @@ def build_vector_to_dup : GICombineRule< >; def build_vector_to_vector_insert : GICombineRule< - (defs root:$root, register_matchinfo:$matchinfo), + (defs root:$root), (match (G_BUILD_VECTOR $dst, GIVariadic<>:$unused):$root, [{ return matchLowerBuildToInsertVecElt(*${root}, MRI); }]), (apply [{ applyLowerBuildToInsertVecElt(*${root}, MRI, B); }]) From 9a17a6016d02afa6e973f141ab1cada68571f2d2 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 3 Sep 2024 20:02:18 -0700 Subject: [PATCH 022/425] [PowerPC] Use DenseMap::operator[] (NFC) (#107044) --- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index a57ed33bda9c7..f7188b856461b 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2424,8 +2424,7 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( // or two GPRs, so we need table to record information for later save/restore. 
for (const CalleeSavedInfo &Info : CSI) { if (Info.isSpilledToReg()) { - auto &SpilledVSR = - VSRContainingGPRs.FindAndConstruct(Info.getDstReg()).second; + auto &SpilledVSR = VSRContainingGPRs[Info.getDstReg()]; assert(SpilledVSR.second == 0 && "Can't spill more than two GPRs into VSR!"); if (SpilledVSR.first == 0) From f15e3e58c59b4d31eee24fa9debc5dfad0c20028 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 3 Sep 2024 20:02:29 -0700 Subject: [PATCH 023/425] [CGOpenMPRuntime] Use DenseMap::operator[] (NFC) (#107158) I'm planning to deprecate DenseMap::FindAndConstruct in favor of DenseMap::operator[]. --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 34120486996fb..23b977be81602 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1149,10 +1149,8 @@ void CGOpenMPRuntime::emitUserDefinedReduction( /*IsCombiner=*/false); } UDRMap.try_emplace(D, Combiner, Initializer); - if (CGF) { - auto &Decls = FunctionUDRMap.FindAndConstruct(CGF->CurFn); - Decls.second.push_back(D); - } + if (CGF) + FunctionUDRMap[CGF->CurFn].push_back(D); } std::pair @@ -1432,10 +1430,8 @@ llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF, ThreadID = CGF.EmitLoadOfScalar(LVal, Loc); // If value loaded in entry block, cache it and use it everywhere in // function. - if (CGF.Builder.GetInsertBlock() == TopBlock) { - auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); - Elem.second.ThreadID = ThreadID; - } + if (CGF.Builder.GetInsertBlock() == TopBlock) + OpenMPLocThreadIDMap[CGF.CurFn].ThreadID = ThreadID; return ThreadID; } } @@ -8640,8 +8636,7 @@ class MappableExprsHandler { const MapData &BaseData = CI == CE ? L : L1; OMPClauseMappableExprCommon::MappableExprComponentListRef SubData = SI == SE ? Components : Components1; - auto &OverlappedElements = OverlappedData.FindAndConstruct(&BaseData); - OverlappedElements.getSecond().push_back(SubData); + OverlappedData[&BaseData].push_back(SubData); } } } @@ -9316,10 +9311,8 @@ void CGOpenMPRuntime::emitUserDefinedMapper(const OMPDeclareMapperDecl *D, MapperCGF.EmitBlock(DoneBB, /*IsFinished=*/true); MapperCGF.FinishFunction(); UDMMap.try_emplace(D, Fn); - if (CGF) { - auto &Decls = FunctionUDMMap.FindAndConstruct(CGF->CurFn); - Decls.second.push_back(D); - } + if (CGF) + FunctionUDMMap[CGF->CurFn].push_back(D); } /// Emit the array initialization or deletion portion for user-defined mapper From 86627149f6fd5148311b7b0aa1c7195a05a5d6a8 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Wed, 4 Sep 2024 12:15:20 +0900 Subject: [PATCH 024/425] [AMDGPU] Mitigate GFX12 VALU read SGPR hazard (#100067) Any SGPR read by a VALU can potentially obscure SALU writes to the same register. Insert s_wait_alu instructions to mitigate the hazard on affected paths. Compute a global cache of SGPRs with any VALU reads and use this to avoid inserting mitigation for SGPRs never accessed by VALUs. To avoid excessive searching when compile time is a priority, implement a secondary mode in which all SALU writes are mitigated.
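For illustration, a minimal sketch of the hazard sequence and the inserted wait, using hypothetical registers and instructions; the 0xfffe immediate, which encodes sa_sdst(0), matches the s_wait_alu form visible in the updated tests:

    v_add_f32 v0, s4, v1    ; (1) VALU reads SGPR pair s[4:5]
    s_mov_b32 s4, s6        ; (2) SALU writes the same SGPR pair
    s_wait_alu 0xfffe       ;     inserted mitigation, sa_sdst(0)
    s_add_i32 s7, s4, 1     ; (3) subsequent SALU/VALU read of s4 is now safe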
Co-authored-by: Shilei Tian --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 288 +++++- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 4 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 + .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 24 +- .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 24 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 2 + .../AMDGPU/atomic_optimizations_buffer.ll | 68 +- .../atomic_optimizations_global_pointer.ll | 96 +- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 57 +- .../atomic_optimizations_struct_buffer.ll | 57 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 125 ++- .../buffer-fat-pointer-atomicrmw-fmax.ll | 167 +++- .../buffer-fat-pointer-atomicrmw-fmin.ll | 167 +++- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 104 ++- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 176 +++- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 176 +++- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 232 +++-- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 18 +- llvm/test/CodeGen/AMDGPU/fmaximum.ll | 3 +- llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 4 +- llvm/test/CodeGen/AMDGPU/fminimum.ll | 3 +- llvm/test/CodeGen/AMDGPU/fminimum3.ll | 4 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 106 ++- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 176 +++- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 176 +++- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 232 +++-- .../test/CodeGen/AMDGPU/global_atomics_i64.ll | 48 +- .../hazard-recognizer-src-shared-base.ll | 23 + .../AMDGPU/indirect-call-known-callees.ll | 15 +- .../insert_waitcnt_for_precise_memory.ll | 43 +- .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 6 + .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 44 +- ...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 10 +- .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 19 +- ...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 20 +- ...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 20 +- ...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 20 +- ...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 16 +- ...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll | 2 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 2 + llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 2 + llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 46 +- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 37 +- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 1 + llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 108 ++- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 113 ++- .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 80 +- .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 80 +- .../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 120 ++- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 3 +- .../lower-work-group-id-intrinsics-hsa.ll | 5 +- .../lower-work-group-id-intrinsics-pal.ll | 5 +- .../materialize-frame-index-sgpr.gfx10.ll | 34 + .../AMDGPU/materialize-frame-index-sgpr.ll | 17 +- .../AMDGPU/memory-legalizer-flat-agent.ll | 156 ++++ .../AMDGPU/memory-legalizer-flat-lastuse.ll | 8 + .../memory-legalizer-flat-nontemporal.ll | 26 + .../memory-legalizer-flat-singlethread.ll | 156 ++++ .../AMDGPU/memory-legalizer-flat-system.ll | 156 ++++ .../AMDGPU/memory-legalizer-flat-volatile.ll | 26 + .../AMDGPU/memory-legalizer-flat-wavefront.ll | 154 ++++ .../AMDGPU/memory-legalizer-flat-workgroup.ll | 148 +++ .../AMDGPU/memory-legalizer-global-agent.ll | 150 +++ .../AMDGPU/memory-legalizer-global-lastuse.ll | 8 + .../memory-legalizer-global-nontemporal.ll | 18 + 
.../memory-legalizer-global-singlethread.ll | 152 +++ .../AMDGPU/memory-legalizer-global-system.ll | 142 +++ .../memory-legalizer-global-volatile.ll | 20 + .../memory-legalizer-global-wavefront.ll | 152 +++ .../memory-legalizer-global-workgroup.ll | 152 +++ .../memory-legalizer-invalid-syncscope.ll | 1 + .../AMDGPU/memory-legalizer-local-agent.ll | 120 +++ .../memory-legalizer-local-nontemporal.ll | 16 + .../memory-legalizer-local-singlethread.ll | 120 +++ .../AMDGPU/memory-legalizer-local-system.ll | 120 +++ .../AMDGPU/memory-legalizer-local-volatile.ll | 14 + .../memory-legalizer-local-wavefront.ll | 120 +++ .../memory-legalizer-local-workgroup.ll | 120 +++ .../memory-legalizer-private-lastuse.ll | 6 + .../memory-legalizer-private-nontemporal.ll | 18 + .../memory-legalizer-private-volatile.ll | 16 + .../AMDGPU/pseudo-scalar-transcendental.ll | 37 +- llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 8 +- llvm/test/CodeGen/AMDGPU/v_swap_b16.ll | 10 +- .../CodeGen/AMDGPU/valu-mask-write-hazard.mir | 168 ++-- .../CodeGen/AMDGPU/valu-read-sgpr-hazard.mir | 862 ++++++++++++++++++ .../CodeGen/AMDGPU/vcmpx-permlane-hazard.mir | 5 +- 90 files changed, 5924 insertions(+), 915 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll create mode 100644 llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 2c1071c543305..cc39fd1740683 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -14,6 +14,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -44,6 +45,10 @@ static cl::opt cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops.")); +static cl::opt MaxExhaustiveHazardSearch( + "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden, + cl::desc("Maximum function size for exhausive hazard search")); + //===----------------------------------------------------------------------===// // Hazard Recognizer Implementation //===----------------------------------------------------------------------===// @@ -51,15 +56,11 @@ static cl::opt static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST); -GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : - IsHazardRecognizerMode(false), - CurrCycleInstr(nullptr), - MF(MF), - ST(MF.getSubtarget()), - TII(*ST.getInstrInfo()), - TRI(TII.getRegisterInfo()), - ClauseUses(TRI.getNumRegUnits()), - ClauseDefs(TRI.getNumRegUnits()) { +GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) + : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF), + ST(MF.getSubtarget()), TII(*ST.getInstrInfo()), + TRI(TII.getRegisterInfo()), UseVALUReadHazardExhaustiveSearch(false), + ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 
19 : 5; TSchedModel.init(&ST); RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); @@ -1195,6 +1196,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixWMMAHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); + fixVALUReadSGPRHazard(MI); fixRequiredExportPriority(MI); } @@ -3010,6 +3012,274 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { return true; } +// Return the numeric ID 0-63 of an 64b SGPR pair for a given SGPR. +// i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc +static std::optional sgprPairNumber(Register Reg, + const SIRegisterInfo &TRI) { + switch (Reg) { + case AMDGPU::M0: + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: + return {}; + default: + break; + } + unsigned RegN = TRI.getEncodingValue(Reg); + if (RegN > 127) + return {}; + return (RegN >> 1) & 0x3f; +} + +// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs. +void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) { + assert(MMF == &MF); + + // Assume non-empty vector means it has already been computed. + if (!VALUReadHazardSGPRs.empty()) + return; + + auto CallingConv = MF.getFunction().getCallingConv(); + bool IsCallFree = + AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls(); + + // Exhaustive search is only viable in non-caller/callee functions where + // VALUs will be exposed to the hazard recognizer. + UseVALUReadHazardExhaustiveSearch = + IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None && + MF.getInstructionCount() <= MaxExhaustiveHazardSearch; + + // Consider all SGPRs hazards if the shader uses function calls or is callee. + bool UseVALUUseCache = + IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; + VALUReadHazardSGPRs.resize(64, !UseVALUUseCache); + if (!UseVALUUseCache) + return; + + // Perform a post ordered reverse scan to find VALUs which read an SGPR + // before a SALU write to the same SGPR. This provides a reduction in + // hazard insertion when all VALU access to an SGPR occurs after its last + // SALU write, when compared to a linear scan. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + BitVector SALUWriteSGPRs(64), ReadSGPRs(64); + MachineCycleInfo CI; + CI.compute(*MMF); + + for (auto *MBB : post_order(&MF)) { + bool InCycle = CI.getCycle(MBB) != nullptr; + for (auto &MI : reverse(MBB->instrs())) { + bool IsVALU = SIInstrInfo::isVALU(MI); + bool IsSALU = SIInstrInfo::isSALU(MI); + if (!IsVALU && !IsSALU) + continue; + + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg()) + continue; + Register Reg = Op.getReg(); + assert(!Op.getSubReg()); + // Only consider implicit operands of VCC. + if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO || + Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC)) + continue; + if (!TRI.isSGPRReg(MRI, Reg)) + continue; + auto RegN = sgprPairNumber(Reg, TRI); + if (!RegN) + continue; + if (IsVALU && Op.isUse()) { + // Note: any access within a cycle must be considered a hazard. 
+ if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN])) + VALUReadHazardSGPRs.set(*RegN); + ReadSGPRs.set(*RegN); + } else if (IsSALU) { + if (Op.isDef()) + SALUWriteSGPRs.set(*RegN); + else + ReadSGPRs.set(*RegN); + } + } + } + } +} + +bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) { + if (!ST.hasVALUReadSGPRHazard()) + return false; + + // The hazard sequence is fundamentally three instructions: + // 1. VALU reads SGPR + // 2. SALU writes SGPR + // 3. VALU/SALU reads SGPR + // Try to avoid searching for (1) because the expiry point of the hazard is + // indeterminate; however, the hazard between (2) and (3) can expire if the + // gap contains sufficient SALU instructions with no usage of SGPR from (1). + // Note: SGPRs must be considered as 64-bit pairs as hazard exists + // even if individual SGPRs are accessed. + + bool MIIsSALU = SIInstrInfo::isSALU(*MI); + bool MIIsVALU = SIInstrInfo::isVALU(*MI); + if (!(MIIsSALU || MIIsVALU)) + return false; + + // Avoid expensive search when compile time is priority by + // mitigating every SALU which writes an SGPR. + if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) { + if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI)) + return false; + + const MachineOperand *SDSTOp = + TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); + if (!SDSTOp || !SDSTOp->isReg()) + return false; + + const Register HazardReg = SDSTOp->getReg(); + if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO || + HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0) + return false; + + // Add s_wait_alu sa_sdst(0) after SALU write. + auto NextMI = std::next(MI->getIterator()); + auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + + // SALU write may be s_getpc in a bundle. + updateGetPCBundle(NewMI); + + return true; + } + + // Pre-compute set of SGPR pairs read by VALUs. + // Note: pass mutable pointer to MachineFunction for CycleInfo. + computeVALUHazardSGPRs(MI->getMF()); + + // If no VALUs hazard SGPRs exist then nothing to do. + if (VALUReadHazardSGPRs.none()) + return false; + + // All SGPR writes before a call/return must be flushed as the callee/caller + // will not will not see the hazard chain, i.e. (2) to (3) described above. + const bool IsSetPC = (MI->isCall() || MI->isReturn()) && + !(MI->getOpcode() == AMDGPU::S_ENDPGM || + MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED); + + // Collect all SGPR sources for MI which are read by a VALU. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallSet SGPRsUsed; + + if (!IsSetPC) { + for (const MachineOperand &Op : MI->all_uses()) { + Register OpReg = Op.getReg(); + + // Only consider VCC implicit uses on VALUs. + // The only expected SALU implicit access is SCC which is no hazard. + if (MIIsSALU && Op.isImplicit()) + continue; + + if (!TRI.isSGPRReg(MRI, OpReg)) + continue; + + auto RegN = sgprPairNumber(OpReg, TRI); + if (!RegN) + continue; + + if (!VALUReadHazardSGPRs[*RegN]) + continue; + + SGPRsUsed.insert(OpReg); + } + + // No SGPRs -> nothing to do. + if (SGPRsUsed.empty()) + return false; + } + + // A hazard is any SALU which writes one of the SGPRs read by MI. + auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) { + if (!SIInstrInfo::isSALU(I)) + return false; + // Ensure SGPR flush before call/return by conservatively assuming every + // SALU writes an SGPR. + if (IsSetPC && I.getNumDefs() > 0) + return true; + // Check for any register writes. 
+ return any_of(SGPRsUsed, [this, &I](Register Reg) { + return I.modifiesRegister(Reg, &TRI); + }); + }; + + const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11; + auto IsExpiredFn = [&](const MachineInstr &I, int Count) { + if (Count >= SALUExpiryCount) + return true; + // s_wait_alu sa_sdst(0) on path mitigates hazard. + if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) + return true; + return false; + }; + + auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) { + // Only count true SALUs as wait states. + if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I)) + return 0; + // SALU must be unrelated to any hazard registers. + if (any_of(SGPRsUsed, + [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); })) + return 0; + return 1; + }; + + // Check for the hazard. + DenseSet Visited; + int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(), + std::next(MI->getReverseIterator()), 0, + IsExpiredFn, Visited, WaitStatesFn); + + if (WaitStates >= SALUExpiryCount) + return false; + + // Validate hazard through an exhaustive search. + if (UseVALUReadHazardExhaustiveSearch) { + // A hazard is any VALU which reads one of the paired SGPRs read by MI. + // This is searching for (1) in the hazard description. + auto hazardPair = [this](Register Reg) { + if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) + return Register(AMDGPU::VCC); + auto RegN = sgprPairNumber(Reg, TRI); + return Register(AMDGPU::SGPR0_SGPR1 + *RegN); + }; + auto SearchHazardFn = [this, hazardPair, + &SGPRsUsed](const MachineInstr &I) { + if (!SIInstrInfo::isVALU(I)) + return false; + // Check for any register reads. + return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { + return I.readsRegister(hazardPair(Reg), &TRI); + }); + }; + auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { + return false; + }; + if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == + std::numeric_limits::max()) + return false; + } + + // Add s_wait_alu sa_sdst(0) before SALU read. + auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + + // SALU read may be after s_getpc in a bundle. + updateGetPCBundle(NewMI); + + return true; +} + static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII) { MachineBasicBlock &EntryMBB = MF->front(); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index f2a64ab48e180..e840e2445188f 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -48,6 +48,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { const SIRegisterInfo &TRI; TargetSchedModel TSchedModel; bool RunLdsBranchVmemWARHazardFixup; + BitVector VALUReadHazardSGPRs; + bool UseVALUReadHazardExhaustiveSearch; /// RegUnits of uses in the current soft memory clause. 
BitVector ClauseUses; @@ -107,6 +109,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { bool fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); + void computeVALUHazardSGPRs(MachineFunction *MMF); + bool fixVALUReadSGPRHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 9386bcf0d74b2..7b74eab96c567 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1247,6 +1247,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } + bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; } + /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return GFX90AInsts; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 61d2c854dffa5..df81b926bceb3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -623,13 +623,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -779,12 +781,14 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1211,13 +1215,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1365,12 +1371,14 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1868,12 +1876,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2059,12 +2069,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 83be67a9138f6..53d9bf0751a1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -623,13 +623,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -779,12 +781,14 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 
v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1211,13 +1215,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1365,12 +1371,14 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1868,12 +1876,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2059,12 +2069,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 04833eaaa3283..50d40368dd107 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1404,6 +1404,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_mov_b32 s2, 3 ; GFX12-NEXT: s_mov_b32 s1, 2 ; GFX12-NEXT: s_mov_b32 s0, 1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -1494,6 +1495,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_mov_b32 s2, 3 ; GFX12-NEXT: s_mov_b32 s1, 2 ; GFX12-NEXT: s_mov_b32 s0, 1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 17fe3adc22169..5cf9c9faa693e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -226,6 +226,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -233,8 +234,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -263,13 +265,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -509,6 +514,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; 
GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -517,8 +523,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: @@ -547,10 +554,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: @@ -559,6 +568,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -878,7 +888,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -899,6 +909,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -924,22 +935,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: @@ -948,6 +962,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1289,7 +1304,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -1312,6 +1327,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_clause 0x1 ; GFX12W64-NEXT: s_load_b32 s5, s[2:3], 0x44 ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, s5 @@ -1338,22 +1354,25 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX12W32-NEXT: ; %bb.3: @@ -1364,6 +1383,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB3_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1719,6 +1739,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; 
GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1726,8 +1747,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -1757,13 +1779,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -2006,6 +2031,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -2014,8 +2040,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: @@ -2045,10 +2072,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: @@ -2378,7 +2407,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: 
s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -2399,6 +2428,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -2424,22 +2454,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: @@ -2448,6 +2481,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 16f3ff4be6b50..ed036a83b6143 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -266,6 +266,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -273,8 +274,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_mul_i32 s6, s6, 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 @@ -307,10 +310,13 @@ define amdgpu_kernel void 
@add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB0_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_i32 s5, s5, 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_mov_b32_e32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -320,6 +326,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB0_2: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 @@ -597,6 +604,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b64 s[0:1], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -640,6 +648,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB1_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -1012,7 +1021,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 @@ -1033,6 +1042,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -1064,23 +1074,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; 
GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: @@ -1095,6 +1108,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB2_4: +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 @@ -1520,6 +1534,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1527,12 +1542,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 @@ -1550,6 +1567,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB2_2: +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 @@ -1595,8 +1613,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -1604,6 +1622,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232_DPP-NEXT: 
s_mov_b32 s7, 0x31016000 ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 @@ -1900,6 +1919,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1910,6 +1930,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-NEXT: s_wait_kmcnt 0x0 @@ -1945,10 +1966,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB3_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -2288,6 +2312,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2298,6 +2323,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s8 ; GFX1264-NEXT: v_mov_b32_e32 v1, s9 ; GFX1264-NEXT: s_mov_b32 s10, -1 @@ -2336,6 +2362,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB4_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -2349,6 +2376,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB4_2: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 @@ -2763,7 +2791,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 @@ 
-2785,6 +2813,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -2819,8 +2848,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 @@ -2828,6 +2858,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 ; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -3594,6 +3625,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -3601,6 +3633,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec @@ -3619,6 +3652,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB5_2: +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8 @@ -3698,6 +3732,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 ; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo @@ -3715,6 +3750,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_DPP-NEXT: .LBB5_2: +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 
@@ -3989,6 +4025,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -3996,8 +4033,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_mul_i32 s6, s6, 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 @@ -4031,10 +4070,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB6_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_i32 s5, s5, 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_mov_b32_e32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -4044,6 +4086,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB6_2: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 @@ -4326,6 +4369,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b64 s[0:1], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -4370,6 +4414,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB7_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -4743,7 +4788,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 @@ -4764,6 +4809,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; 
GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -4795,23 +4841,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: @@ -4826,6 +4875,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB8_4: +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 @@ -5251,6 +5301,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5258,12 +5309,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 @@ -5281,6 +5334,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB8_2: +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, 
s[8:9] ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 @@ -5326,8 +5380,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -5335,6 +5389,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 @@ -5646,6 +5701,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5656,6 +5712,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-NEXT: s_wait_kmcnt 0x0 @@ -5694,10 +5751,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB9_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -6053,6 +6113,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -6063,6 +6124,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s8 ; GFX1264-NEXT: v_mov_b32_e32 v1, s9 ; GFX1264-NEXT: s_mov_b32 s10, -1 @@ -6105,6 +6167,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB10_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ 
-6118,6 +6181,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB10_2: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 @@ -6536,7 +6600,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 @@ -6558,6 +6622,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -6592,8 +6657,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 @@ -6601,6 +6667,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 ; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -7367,6 +7434,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -7374,6 +7442,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec @@ -7392,6 +7461,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB11_2: +; GFX1264_DPP-NEXT: 
s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8 @@ -7471,6 +7541,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 ; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo @@ -7488,6 +7559,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_DPP-NEXT: .LBB11_2: +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 3a2efadac067d..9d4dfd8911257 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -225,6 +225,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -232,8 +233,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -262,13 +264,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -508,6 +513,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 
@@ -516,8 +522,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: @@ -546,10 +553,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: @@ -558,6 +567,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -877,7 +887,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -898,6 +908,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -923,22 +934,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; 
implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: @@ -947,6 +961,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1302,6 +1317,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1309,8 +1325,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -1340,13 +1357,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB4_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1589,6 +1609,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1597,8 +1618,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: @@ -1628,10 +1650,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr 
addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: @@ -1961,7 +1985,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -1982,6 +2006,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -2007,22 +2032,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W32-NEXT: ; %bb.3: @@ -2031,6 +2059,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index d0c0b62c78e42..3fb44e090c61f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -232,6 +232,7 @@ define amdgpu_kernel void 
@add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -240,8 +241,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -270,13 +272,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -523,6 +528,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -532,8 +538,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: @@ -562,10 +569,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: @@ -574,6 +583,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr 
addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -899,7 +909,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -921,6 +931,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -946,22 +957,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: @@ -970,6 +984,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1466,6 +1481,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1474,8 +1490,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, 
s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -1505,13 +1522,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1761,6 +1781,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1770,8 +1791,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: @@ -1801,10 +1823,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: @@ -2140,7 +2164,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -2162,6 +2186,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; 
GFX12W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -2187,22 +2212,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: @@ -2211,6 +2239,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 63cdd8a3bb16d..e195026c13d27 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -436,13 +436,16 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -450,6 +453,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -2049,7 +2053,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -2067,12 +2071,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2323,7 +2329,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 @@ -2340,12 +2346,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2596,12 +2604,15 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: @@ -2626,11 +2637,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 @@ -2640,11 +2654,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB10_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3114,7 +3129,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -3132,12 +3147,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3408,7 +3425,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -3426,12 +3443,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3685,14 +3704,17 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -3705,6 +3727,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 @@ -3714,12 +3737,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4055,14 +4079,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4075,6 +4102,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 @@ -4084,11 +4112,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4430,11 +4459,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -4464,11 +4496,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -4478,12 +4513,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5056,14 +5092,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start @@ -5084,6 +5123,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 @@ -5093,12 +5133,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5481,14 +5522,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start @@ -5509,6 +5553,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 @@ -5518,11 +5563,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; 
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5910,11 +5956,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -5954,11 +6003,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -5968,12 +6020,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -7127,13 +7180,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: @@ -7141,6 +7197,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9585,13 +9642,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: @@ -9599,6 +9659,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index c90296124eb12..c7569a6c155db 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -428,13 +428,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -442,6 +445,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1297,7 +1301,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) 
+; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -1317,12 +1321,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1514,7 +1520,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -1533,12 +1539,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1734,12 +1742,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: @@ -1767,11 +1778,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1781,11 +1795,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2150,7 +2165,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -2170,12 +2185,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2461,7 +2478,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -2481,12 +2498,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2681,14 +2700,17 @@ define 
half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start @@ -2701,8 +2723,9 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 @@ -2712,12 +2735,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3067,14 +3091,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3087,8 +3114,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 @@ -3098,11 +3126,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3457,11 +3486,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: @@ -3495,11 +3527,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 @@ -3509,12 +3544,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4100,14 +4136,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) 
| instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -4128,6 +4167,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 @@ -4137,12 +4177,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4527,14 +4568,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4555,6 +4599,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 @@ -4564,11 +4609,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4958,11 +5004,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -5002,11 +5051,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -5016,12 +5068,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5643,8 +5696,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -5662,12 +5716,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: 
s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5987,6 +6043,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 @@ -6004,12 +6061,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6329,12 +6388,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -6362,11 +6424,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -6376,12 +6441,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6937,8 +7003,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -6972,12 +7038,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7382,7 +7450,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 @@ -7414,12 +7482,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7823,12 +7893,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; 
GFX12-NEXT: ; %bb.2: @@ -7871,11 +7944,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7885,12 +7961,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 91adbfa559976..9f97d2033bbb5 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -428,13 +428,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -442,6 +445,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1297,7 +1301,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) 
+; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -1317,12 +1321,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1514,7 +1520,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -1533,12 +1539,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1734,12 +1742,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: @@ -1767,11 +1778,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1781,11 +1795,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2150,7 +2165,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -2170,12 +2185,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2461,7 +2478,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -2481,12 +2498,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2681,14 +2700,17 @@ define 
half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start @@ -2701,8 +2723,9 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 @@ -2712,12 +2735,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3067,14 +3091,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3087,8 +3114,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 @@ -3098,11 +3126,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3457,11 +3486,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: @@ -3495,11 +3527,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 @@ -3509,12 +3544,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4100,14 +4136,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) 
| instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -4128,6 +4167,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 @@ -4137,12 +4177,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4527,14 +4568,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4555,6 +4599,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 @@ -4564,11 +4609,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4958,11 +5004,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -5002,11 +5051,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -5016,12 +5068,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5643,8 +5696,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -5662,12 +5716,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: 
s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5987,6 +6043,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 @@ -6004,12 +6061,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6329,12 +6388,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -6362,11 +6424,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -6376,12 +6441,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6937,8 +7003,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -6972,12 +7038,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7382,7 +7450,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 @@ -7414,12 +7482,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7823,12 +7893,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; 
GFX12-NEXT: ; %bb.2: @@ -7871,11 +7944,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7885,12 +7961,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 422c8a0be23b4..1ae1204e3cde1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -5707,13 +5707,15 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -5888,13 +5890,15 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6070,13 +6074,15 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6263,12 +6269,14 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -6432,12 +6440,14 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6608,12 +6618,14 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6816,13 +6828,15 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -7105,13 +7119,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7403,13 +7419,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7700,12 +7718,14 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -7978,12 +7998,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8265,12 +8287,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8541,12 +8565,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8756,13 +8782,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8990,13 +9018,15 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9289,12 +9319,14 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9590,13 +9622,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9933,13 +9967,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10286,13 +10322,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10639,12 +10677,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10981,12 +11021,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11313,13 +11355,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11596,12 +11640,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11882,12 +11928,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12215,13 +12263,15 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12569,12 +12619,14 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index ea2427a3c420f..ed78f4a071e3d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -2800,13 +2800,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2954,13 +2956,15 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3113,13 +3117,15 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3286,12 +3292,14 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3435,12 +3443,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], 
v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3591,12 +3601,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3761,13 +3773,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -3971,13 +3985,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4143,13 +4159,15 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] 
; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4446,13 +4464,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4758,13 +4778,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5070,12 +5092,14 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5364,12 +5388,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5667,12 +5693,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5959,13 +5987,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6195,12 +6225,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6437,13 +6469,15 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6751,12 +6785,14 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7065,13 +7101,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -7409,13 +7447,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7763,13 +7803,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8115,12 +8157,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8448,12 +8492,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8791,12 +8837,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9124,13 +9172,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -9408,12 +9458,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9698,13 +9750,15 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10053,12 +10107,14 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10381,13 +10437,15 @@ define <2 x half> 
@flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -10616,13 +10674,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10854,13 +10914,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11110,12 +11172,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11336,12 +11400,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11569,12 +11635,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11820,13 +11888,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12060,12 +12130,14 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12317,13 +12389,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -12662,13 +12736,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13010,13 +13086,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13375,12 +13453,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13709,12 +13789,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14050,12 +14132,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14410,13 +14494,15 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14759,12 +14845,14 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 2767b66e44703..bdb945a652eb2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -2800,13 +2800,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2954,13 +2956,15 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3113,13 +3117,15 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3286,12 +3292,14 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3435,12 +3443,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3591,12 +3601,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3761,13 +3773,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -3971,13 +3985,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4143,13 +4159,15 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4446,13 +4464,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4758,13 +4778,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5070,12 +5092,14 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5364,12 +5388,14 @@ define void 
@flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5667,12 +5693,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5959,13 +5987,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6195,12 +6225,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6437,13 +6469,15 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6751,12 +6785,14 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7065,13 +7101,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -7409,13 +7447,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7763,13 +7803,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8115,12 +8157,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8448,12 +8492,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8791,12 +8837,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9124,13 +9172,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -9408,12 +9458,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9698,13 +9750,15 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10053,12 +10107,14 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10381,13 +10437,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -10616,13 +10674,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10854,13 +10914,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11110,12 +11172,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: 
global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11336,12 +11400,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11569,12 +11635,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11820,13 +11888,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12060,12 +12130,14 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12317,13 +12389,15 @@ 
define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -12662,13 +12736,15 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13010,13 +13086,15 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13375,12 +13453,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13709,12 +13789,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14050,12 +14132,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14410,13 +14494,15 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14759,12 +14845,14 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 6672f16c4a7a8..c7f2bf6d1b317 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -34,13 +34,15 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32: @@ -229,13 +231,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -428,13 +432,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -644,12 +650,14 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32: @@ -829,12 +837,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1021,12 +1031,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1232,13 +1244,15 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1432,12 +1446,14 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1631,13 +1647,15 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: @@ -1826,13 +1844,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2025,13 +2045,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2241,12 +2263,14 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2426,12 +2450,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2618,12 +2644,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2829,13 +2857,15 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3029,12 +3059,14 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3228,13 +3260,15 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64: @@ -3439,13 +3473,15 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -3651,13 +3687,15 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -3877,12 +3915,14 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64: @@ -4072,12 +4112,14 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4274,12 +4316,14 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; 
GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4513,13 +4557,15 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16: @@ -4802,13 +4848,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5100,13 +5148,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -5397,12 +5447,14 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -5675,12 +5727,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; 
GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -5962,12 +6016,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -6239,13 +6295,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -6460,12 +6518,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -6687,13 +6747,15 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: @@ -6986,12 +7048,14 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; 
GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -7287,13 +7351,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16: @@ -7630,13 +7696,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -7983,13 +8051,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -8334,12 +8404,14 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -8666,12 +8738,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -9008,12 +9082,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -9340,13 +9416,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -9623,12 +9701,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -9912,13 +9992,15 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -10266,12 +10348,14 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -10591,13 +10675,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16: @@ -10809,13 +10895,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -11030,13 +11118,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -11268,12 +11358,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -11475,12 +11567,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -11689,12 +11783,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -11922,13 +12018,15 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12144,12 +12242,14 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -12385,13 +12485,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16: @@ -12730,13 +12832,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -13078,13 +13182,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -13443,12 +13549,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -13777,12 +13885,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -14118,12 +14228,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -14478,13 +14590,15 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -14827,12 +14941,14 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 9a9fd289e2d0c..9653f8fdacac6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -265,10 +265,11 @@ define void @zero_init_foo() { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 @@ -354,10 +355,11 @@ define void @zero_init_foo() { ; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 -; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 ; GFX12-PAL-NEXT: s_mov_b32 s2, s0 ; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 @@ -1315,10 +1317,11 @@ define void @zero_init_small_offset_foo() { ; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 @@ -1414,10 +1417,11 @@ define void @zero_init_small_offset_foo() { ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 -; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 ; GFX12-PAL-NEXT: s_mov_b32 s2, s0 ; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 @@ -2526,10 +2530,11 @@ define void @zero_init_large_offset_foo() { ; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 @@ -2666,10 +2671,11 @@ define void @zero_init_large_offset_foo() { ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 -; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 ; GFX12-PAL-NEXT: s_mov_b32 s2, s0 ; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index 3b7009023b03a..04cd150d93176 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -174,7 +174,8 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x ha ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, s0, s2 ; GFX12-GISEL-NEXT: s_maximum_f16 s0, s1, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 9ce1ba3316dd5..27282a453075b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -462,7 +462,7 @@ define float @v_fmaximum3_f32_const1_const2(float %a) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0x41000000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_maximum3_f32 v0, v0, s0, 0x41800000 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1409,7 +1409,7 @@ define half @v_fmaximum3_f16_const1_const2(half %a) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_movk_i32 s0, 0x4800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index 817e6dd87361f..3271758f71297 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -174,7 +174,8 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x ha ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, s0, s2 ; GFX12-GISEL-NEXT: s_minimum_f16 s0, s1, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %val = call <3 x 
half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 21074d58bdb7e..d9ba2de48bb01 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -462,7 +462,7 @@ define float @v_fminimum3_f32_const1_const2(float %a) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0x41000000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_minimum3_f32 v0, v0, s0, 0x41800000 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1409,7 +1409,7 @@ define half @v_fminimum3_f16_const1_const2(half %a) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_movk_i32 s0, 0x4800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_minimum3_f16 v0, v0, s0, 0x4c00 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 064238c63717e..361cc1e9e6c1d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -6963,13 +6963,15 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -7184,13 +7186,15 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7406,13 +7410,15 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7635,12 +7641,14 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7838,12 +7846,14 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8044,12 +8054,14 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8277,13 +8289,15 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8616,13 +8630,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8966,13 +8982,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9315,12 +9333,14 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9642,12 +9662,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9979,12 +10001,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10306,13 +10330,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10566,12 +10592,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10830,13 +10858,15 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11181,12 +11211,14 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11532,13 +11564,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -11925,13 +11959,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12330,13 +12366,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12733,12 +12771,14 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -13114,12 +13154,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13506,12 +13548,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13888,13 +13932,15 @@ 
define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -14210,12 +14256,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14536,13 +14584,15 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB62_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14942,12 +14992,14 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB63_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -23072,8 +23124,10 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-NEXT: s_cbranch_execz .LBB92_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 
06d971febd038..84003a0432f7e 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -3020,13 +3020,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3192,13 +3194,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3365,13 +3369,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3538,12 +3544,14 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3700,12 +3708,14 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 
s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3865,12 +3875,14 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4030,13 +4042,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -4278,13 +4292,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4468,13 +4484,15 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4821,13 +4839,15 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5185,13 +5205,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5549,12 +5571,14 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5892,12 +5916,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6245,12 +6271,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6587,13 +6615,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6862,12 +6892,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7141,13 +7173,15 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7507,12 +7541,14 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7871,13 +7907,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -8266,13 +8304,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8673,13 +8713,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9078,12 +9120,14 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9461,12 +9505,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9855,12 +9901,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10239,13 +10287,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10563,12 +10613,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10891,13 +10943,15 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11299,12 +11353,14 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11678,13 +11734,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -11970,13 +12028,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12264,13 +12324,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12562,12 +12624,14 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12843,12 +12907,14 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13127,12 +13193,14 @@ 
define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13419,13 +13487,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13715,12 +13785,14 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14023,13 +14095,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14421,13 +14495,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14821,13 +14897,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15224,12 +15302,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15609,12 +15689,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15997,12 +16079,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16394,13 +16478,15 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16795,12 +16881,14 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 65df8f07fb8b3..2aad91cd1071f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -3020,13 +3020,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3192,13 +3194,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3365,13 +3369,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3538,12 +3544,14 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3700,12 +3708,14 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3865,12 +3875,14 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4030,13 +4042,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -4278,13 +4292,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4468,13 +4484,15 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4821,13 +4839,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5185,13 +5205,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5549,12 +5571,14 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5892,12 +5916,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6245,12 +6271,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6587,13 +6615,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6862,12 +6892,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7141,13 +7173,15 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7507,12 +7541,14 @@ define void 
@global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7871,13 +7907,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -8266,13 +8304,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8673,13 +8713,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9078,12 +9120,14 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9461,12 +9505,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9855,12 +9901,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10239,13 +10287,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10563,12 +10613,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10891,13 +10943,15 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: 
; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11299,12 +11353,14 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11678,13 +11734,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -11970,13 +12028,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12264,13 +12324,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12562,12 +12624,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12843,12 +12907,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13127,12 +13193,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13419,13 +13487,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13715,12 +13785,14 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14023,13 +14095,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14421,13 +14495,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14821,13 +14897,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15224,12 +15302,14 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15609,12 +15689,14 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15997,12 +16079,14 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16394,13 +16478,15 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16795,12 +16881,14 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 79aa69771f84b..2e3799e1714af 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -35,13 +35,15 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32: @@ -266,13 +268,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -499,13 +503,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -741,12 +747,14 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32: @@ -961,12 +969,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1184,12 +1194,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1416,13 +1428,15 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1650,12 +1664,14 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1880,13 +1896,15 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__ftz: @@ -2111,13 +2129,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2344,13 +2364,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2586,12 +2608,14 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2806,12 +2830,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3029,12 +3055,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3261,13 +3289,15 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3495,12 +3525,14 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3725,13 +3757,15 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64: @@ -3976,13 +4010,15 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -4228,13 +4264,15 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -4487,12 +4525,14 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4716,12 +4756,14 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4948,12 +4990,14 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5207,13 +5251,15 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16: @@ -5546,13 +5592,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5896,13 +5944,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6245,12 +6295,14 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6572,12 +6624,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6909,12 +6963,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7236,13 +7292,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -7496,12 +7554,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7760,13 +7820,15 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: @@ -8111,12 +8173,14 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8462,13 +8526,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16: @@ -8855,13 +8921,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -9260,13 +9328,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -9663,12 +9733,14 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -10044,12 +10116,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10436,12 +10510,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10818,13 +10894,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -11140,12 +11218,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11466,13 +11546,15 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -11872,12 +11954,14 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -12247,13 +12331,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16: @@ -12522,13 +12608,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12799,13 +12887,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -13079,12 +13169,14 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13341,12 +13433,14 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13606,12 +13700,14 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13880,13 +13976,15 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -14158,12 +14256,14 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14450,13 +14550,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16: @@ -14848,13 +14950,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -15248,13 +15352,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -15651,12 +15757,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -16036,12 +16144,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16424,12 +16534,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16821,13 +16933,15 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -17222,12 +17336,14 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index d1a371fc4356f..24fd709514b47 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -280,7 +280,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -571,7 +571,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -864,7 +864,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -1155,7 +1155,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -1448,7 +1448,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -1739,7 +1739,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -2014,7 +2014,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -2287,7 +2287,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -2562,7 +2562,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -2835,7 +2835,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -3110,7 +3110,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -3383,7 +3383,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -3658,7 +3658,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -3931,7 +3931,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -4224,7 +4224,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4515,7 +4515,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4920,7 +4920,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5211,7 +5211,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5504,7 +5504,7 @@ define amdgpu_kernel void 
@atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5795,7 +5795,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6102,7 +6102,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 scope:SCOPE_DEV
@@ -6196,7 +6196,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6427,7 +6427,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] scope:SCOPE_DEV
@@ -6518,7 +6518,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
new file mode 100644
index 0000000000000..4aa49f2c9296d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=gfx1201 %s -o - | FileCheck %s
+
+define amdgpu_kernel void @foo() {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
+; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0
+; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3]
+; CHECK-NEXT: s_endpgm
+entry:
+ br label %bb1
+
+bb0:
+ br label %bb1
+
+bb1:
+ %dst = phi ptr [ null, %bb0 ], [ addrspacecast (ptr addrspace(3) null to ptr), %entry ]
+ store i64 0, ptr %dst, align 16
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index eb4cba35e9946..44a2c34b06b57 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -44,15 +44,19 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-LABEL: indirect_call_known_no_special_inputs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_getpc_b64 s[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s7, s7
-; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+8
-; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+16
+; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+24
; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], 0
; GFX12-NEXT: s_getpc_b64 s[8:9]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s9, s9
-; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+8
-; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+16
+; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+24
; GFX12-NEXT: s_load_u8 s12, s[4:5], 0x0
; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX12-NEXT: s_load_b64 s[6:7], s[8:9], 0x0
@@ -61,12 +65,13 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-NEXT: s_mov_b32 s32, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s8, 1, s12
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_eq_u32 s8, 1
; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX12-NEXT: s_cselect_b32 s7, s7, s5
; GFX12-NEXT: s_cselect_b32 s6, s6, s4
; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 3b972352e0e45..0045082eedb0a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -282,13 +282,15 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
ret i32 %result
@@ -401,15 +403,18 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) { ; GFX12-NEXT: scratch_load_b32 v32, off, s32 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+16 +; GFX12-NEXT: s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+24 ; GFX12-NEXT: scratch_store_b32 off, v32, s32 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b64 v[32:33], off, s32 offset:24 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: scratch_store_b64 off, v[32:33], s32 offset:16 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[0:1] entry: %alloca = alloca double, align 8, addrspace(5) @@ -587,25 +592,34 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cvt_u32_f32 s4, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-NEXT: s_mul_i32 s5, s5, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_hi_u32 s5, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_i32 s4, s4, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_hi_u32 s4, s2, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s5, s4, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sub_co_i32 s2, s2, s5 ; GFX12-NEXT: s_add_co_i32 s5, s4, 1 ; GFX12-NEXT: s_sub_co_i32 s6, s2, s3 ; GFX12-NEXT: s_cmp_ge_u32 s2, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cselect_b32 s4, s5, s4 ; GFX12-NEXT: s_cselect_b32 s2, s6, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_i32 s5, s4, 1 ; GFX12-NEXT: s_cmp_ge_u32 s2, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cselect_b32 s2, s5, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -789,9 +803,11 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s0, s0, 5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_u32 v0, v1 @@ -1036,15 +1052,18 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s4, 
s[2:3], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB7_2: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1220,13 +1239,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: .LBB8_2: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1645,4 +1667,3 @@ entry: %bc = bitcast <2 x i32> %r.1 to <2 x float> ret <2 x float> %bc } - diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll index 9445f1225e0cb..99f4fbf359948 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll @@ -64,6 +64,7 @@ define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -83,6 +84,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -100,6 +102,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inre ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -169,6 +172,7 @@ define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsr ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %orig = call i32 
@llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) @@ -188,6 +192,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %r ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) @@ -205,6 +210,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> i ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index aad74410d1453..c4a86952bc414 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -587,7 +587,8 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -664,9 +665,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -680,9 +681,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -728,7 +729,8 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -805,9 +807,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -821,9 +823,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -3298,7 +3300,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3344,7 +3347,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3421,9 +3425,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -3437,9 +3441,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -3516,9 +3520,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -3532,9 +3536,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 320b0b4508b6a..8a0602e0472b5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll @@ -81,23 +81,27 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v5, s[4:7], s3 offen offset:128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr6 ; GFX12-NEXT: ; implicit-def: $vgpr5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 %ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 8bfe996c6a90a..6e029f7c0a95e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -210,6 +210,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) { ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal m0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_var: @@ -489,6 +490,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa ; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal_isfirst m0 ; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: global_load_b32 v1, v[2:3], off @@ -516,8 +518,9 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe ; GLOBAL-ISEL-NEXT: s_and_b32 s0, 1, s0 +; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe ; GLOBAL-ISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GLOBAL-ISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 ; GLOBAL-ISEL-NEXT: global_load_b32 v0, v[0:1], off @@ -741,6 +744,7 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_barrier_init m0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GLOBAL-ISEL-LABEL: test5_s_barrier_init_m0: @@ -752,11 +756,12 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: s_lshl_b32 s0, 16, s0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe ; GLOBAL-ISEL-NEXT: s_or_b32 m0, s1, s0 ; GLOBAL-ISEL-NEXT: s_barrier_init m0 +; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe ; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.init(i32 %arg1, i32 %arg2) ret void @@ -945,6 +950,7 @@ define void @test5_s_barrier_join_m0(i32 %arg) { ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_barrier_join m0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GLOBAL-ISEL-LABEL: test5_s_barrier_join_m0: @@ -1202,6 +1208,7 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) { ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_wakeup_barrier m0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GLOBAL-ISEL-LABEL: test5_s_wakeup_barrier_m0: @@ -1386,10 +1393,11 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) { ; 
GCN-NEXT: s_wait_bvhcnt 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_get_barrier_state s0, m0 ; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -1403,7 +1411,8 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) { ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 ; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v0, s0 ; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index 78204dfefc80c..2efade9fcbba1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -50,23 +50,27 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB2_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x bfloat> %ret @@ -90,22 +94,26 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: 
s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB3_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index 1005996003044..d5b5c71cc42a9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -306,22 +306,26 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB4_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -444,22 +448,26 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; 
GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB5_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index 5f6a67e466020..a312a3cb0a95c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -235,23 +235,27 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB4_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %ret @@ -344,23 +348,27 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB5_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index bd803c380e90a..8d1dce76d2cc8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -554,18 +554,22 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s1, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -708,23 +712,27 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index c9b50eddc94ee..06b1a9cc70513 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -554,18 +554,22 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s1, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -708,23 +712,27 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 
idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll index df5533b629502..5ea89bc574910 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll @@ -49,7 +49,7 @@ define amdgpu_gfx void @test_wave_id_callable(ptr addrspace(1) %out) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 5d3a5800bcdd8..bc0daf95e329c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -515,11 +515,13 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_maximum_f16 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call half @llvm.maximum.f16(half %src0, half %src1) %cast = bitcast half %op to i16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index e6655aeab7e9b..6b61931fc9414 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -485,6 +485,7 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call float @llvm.maximum.f32(float %src0, float %src1) call void asm sideeffect "; use $0", "s"(float %op) @@ -888,6 +889,7 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:1] ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1) call void asm sideeffect "; use $0", "s"(<2 x float> %op) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 01effc24e741d..77b5682a2dbd1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -424,11 +424,13 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_minimum_f16 s0, s0, s1 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call half @llvm.minimum.f16(half %src0, half %src1) %cast = bitcast half %op to i16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 518fc27c23082..8753dc50c4da4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -485,6 +485,7 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call float @llvm.minimum.f32(float %src0, float %src1) call void asm sideeffect "; use $0", "s"(float %op) @@ -888,6 +889,7 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:1] ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1) call void asm sideeffect "; use $0", "s"(<2 x float> %op) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 7178eaf2e7384..0221f9992ad43 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -2360,6 +2360,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2 ; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2 ; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3 ; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3 ; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3 @@ -2397,6 +2398,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015 ; GFX12-NEXT: v_and_b32_e32 v22, 1, v2 ; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11 ; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7 ; GFX12-NEXT: v_and_b32_e32 v4, 1, v5 @@ -2794,6 +2796,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000 ; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013 ; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3 ; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3 ; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3 @@ -2807,7 +2810,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016 ; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014 ; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2 ; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1 ; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9 @@ -3454,6 +3457,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2 ; GFX12-NEXT: v_lshrrev_b16 v9, 13, s3 ; GFX12-NEXT: v_and_b32_e32 v44, 1, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
v_lshrrev_b16 v1, 1, s4 ; GFX12-NEXT: s_lshr_b32 s5, s2, 24 ; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_and_b32 v41, 1, v2 @@ -3467,14 +3471,16 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v13, 7, s3 ; GFX12-NEXT: v_lshrrev_b16 v14, 1, s3 ; GFX12-NEXT: v_lshrrev_b16 v17, 5, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v2, 5, s5 ; GFX12-NEXT: s_and_b32 s7, s2, 1 ; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v52, s18 :: v_dual_and_b32 v35, 1, v9 ; GFX12-NEXT: v_and_b32_e32 v9, 1, v1 ; GFX12-NEXT: v_lshrrev_b16 v1, 3, s4 ; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10017 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v51, s19 :: v_dual_and_b32 v42, 1, v3 ; GFX12-NEXT: v_lshrrev_b16 v3, 3, s5 ; GFX12-NEXT: v_lshrrev_b16 v15, 3, s3 @@ -3489,30 +3495,34 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012 ; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011 ; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v59, s12 :: v_dual_and_b32 v22, 1, v13 ; GFX12-NEXT: v_dual_mov_b32 v62, s9 :: v_dual_and_b32 v13, 1, v17 ; GFX12-NEXT: v_lshrrev_b16 v17, 6, s5 ; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10016 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v58, s13 :: v_dual_and_b32 v23, 1, v14 ; GFX12-NEXT: s_bfe_u32 s14, s2, 0x10015 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v57, s14 :: v_dual_and_b32 v26, 1, v11 ; GFX12-NEXT: v_and_b32_e32 v11, 1, v1 ; GFX12-NEXT: v_lshrrev_b16 v1, 1, s5 ; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10013 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v55, s15 :: v_dual_and_b32 v34, 1, v7 ; GFX12-NEXT: v_lshrrev_b16 v7, 7, s5 ; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10012 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v54, s16 :: v_dual_and_b32 v31, 1, v10 ; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10011 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v53, s17 :: v_dual_and_b32 v38, 1, v5 ; GFX12-NEXT: s_bfe_u32 s20, s3, 0x10016 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v50, s20 :: v_dual_and_b32 v39, 1, v6 ; GFX12-NEXT: v_lshrrev_b16 v6, 2, s5 ; GFX12-NEXT: s_bfe_u32 s21, s3, 0x10014 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v48, s21 :: v_dual_and_b32 v43, 1, v4 ; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018 @@ -3522,7 +3532,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3 ; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3 ; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v60, s11 :: v_dual_and_b32 v19, 1, v15 ; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 ; GFX12-NEXT: v_lshrrev_b16 v8, 14, s2 @@ -3541,6 +3551,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o 
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015 ; GFX12-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v63, s8 :: v_dual_and_b32 v2, 1, v6 ; GFX12-NEXT: v_and_b32_e32 v6, 1, v17 ; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v23 @@ -4266,6 +4277,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 ; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2 ; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4 ; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4 ; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4 @@ -4311,7 +4323,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s20, s3, 0x10016 ; GFX12-NEXT: s_bfe_i32 s21, s3, 0x10014 ; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_mov_b32 v49, s3 ; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1 ; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1 @@ -6791,6 +6803,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2 ; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3 ; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3 ; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3 @@ -6808,6 +6821,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v28, 1, v21 ; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2 ; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014 @@ -6817,6 +6831,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v11, 1, v11 ; GFX12-NEXT: v_and_b32_e32 v13, 1, v13 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012 @@ -6827,6 +6842,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v9, 1, v17 ; GFX12-NEXT: v_and_b32_e32 v29, 1, v23 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2 @@ -6842,6 +6858,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13 ; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1 ; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26 @@ -7554,6 +7571,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v62, v[30:33], s[0:1] offset:64 ; GFX12-NEXT: global_store_b128 v62, v[26:29], s[0:1] offset:48 ; GFX12-NEXT: global_store_b128 v62, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v10, s2 :: 
v_dual_mov_b32 v11, s3 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v62, v[4:7], s[0:1] offset:16 @@ -8449,6 +8467,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v43, 1, v10 ; GFX12-NEXT: v_dual_mov_b32 v68, v1 :: v_dual_and_b32 v69, 1, v2 ; GFX12-NEXT: v_dual_mov_b32 v62, v1 :: v_dual_and_b32 v71, 0xffff, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_and_b32 v67, 0xffff, v3 ; GFX12-NEXT: v_mov_b32_e32 v66, v1 ; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v3, v1 @@ -8457,6 +8476,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3 ; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3 ; GFX12-NEXT: v_dual_mov_b32 v47, v1 :: v_dual_and_b32 v38, 1, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4 ; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4 ; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10016 @@ -8465,6 +8485,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v45, 1, v12 ; GFX12-NEXT: v_and_b32_e32 v41, 1, v16 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s9 ; GFX12-NEXT: v_mov_b32_e32 v0, s8 ; GFX12-NEXT: s_lshr_b32 s5, s2, 24 @@ -8473,6 +8494,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v49, v1 :: v_dual_and_b32 v40, 1, v8 ; GFX12-NEXT: v_and_b32_e32 v44, 1, v14 ; GFX12-NEXT: v_and_b32_e32 v14, 1, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5 ; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5 ; GFX12-NEXT: v_lshrrev_b16 v10, 3, s5 @@ -8483,6 +8505,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013 ; GFX12-NEXT: v_and_b32_e32 v33, 1, v20 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s9 ; GFX12-NEXT: v_mov_b32_e32 v0, s8 ; GFX12-NEXT: v_lshrrev_b16 v9, 15, s3 @@ -8509,6 +8532,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018 ; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s8 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016 @@ -8518,6 +8542,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v82, 0xffff, v35 ; GFX12-NEXT: v_and_b32_e32 v35, 1, v27 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v27, v1 ; GFX12-NEXT: v_and_b32_e32 v81, 0xffff, v4 @@ -8529,6 +8554,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v31 ; GFX12-NEXT: v_and_b32_e32 v31, 1, v29 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s8 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012 @@ -8538,6 +8564,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v21, 2, s2 ; GFX12-NEXT: v_and_b32_e32 v33, 0xffff, v33 ; 
GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s8 ; GFX12-NEXT: v_lshrrev_b16 v15, 8, s2 @@ -8561,6 +8588,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v39 ; GFX12-NEXT: v_and_b32_e32 v39, 1, v25 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_and_b32 v77, 1, v7 ; GFX12-NEXT: v_and_b32_e32 v79, 0xffff, v5 @@ -9818,6 +9846,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v75, s42 :: v_dual_mov_b32 v76, s43 ; GFX12-NEXT: v_bfe_i32 v79, v1, 0, 1 ; GFX12-NEXT: v_bfe_i32 v85, v65, 0, 1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v65, s40 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v12, v[69:72], s[0:1] offset:144 @@ -9903,6 +9932,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_ashrrev_i32_e32 v50, 31, v49 ; GFX12-NEXT: v_ashrrev_i32_e32 v88, 31, v87 ; GFX12-NEXT: v_ashrrev_i32_e32 v86, 31, v85 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v34, s19 :: v_dual_mov_b32 v17, s4 ; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 22b718935738b..2ee1c60b4bbf2 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -2974,6 +2974,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GFX12-NEXT: s_and_b32 s7, s7, 0xffff ; GFX12-NEXT: s_lshr_b32 s25, s6, 16 ; GFX12-NEXT: s_and_b32 s6, s6, 0xffff +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28 ; GFX12-NEXT: v_mov_b32_e32 v10, s11 ; GFX12-NEXT: s_lshr_b32 s22, s5, 16 @@ -3464,6 +3465,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GFX12-NEXT: s_ashr_i32 s25, s6, 16 ; GFX12-NEXT: s_sext_i32_i16 s7, s7 ; GFX12-NEXT: s_sext_i32_i16 s6, s6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28 ; GFX12-NEXT: v_mov_b32_e32 v10, s11 ; GFX12-NEXT: s_ashr_i32 s22, s5, 16 @@ -5795,10 +5797,10 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -6030,6 +6032,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: 
global_store_b128 v1, v[0:3], s[0:1] offset:16 @@ -6370,23 +6373,27 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s2, 0xffff, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0 ; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s6 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_pack_hl_b32_b16 s2, s5, 0 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s5 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_pack_hl_b32_b16 s2, s4, 0 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] @@ -6966,36 +6973,43 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: s_lshr_b32 s5, s4, 16 ; GFX12-NEXT: s_and_b32 s4, s4, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshr_b32 s4, s7, 16 ; GFX12-NEXT: s_and_b32 s5, s7, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s4, s6, 16 ; GFX12-NEXT: s_and_b32 s5, s6, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s4, s3, 16 ; GFX12-NEXT: s_and_b32 s3, s3, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 ; GFX12-NEXT: s_and_b32 s2, s2, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_lshr_b32 s2, s1, 16 ; GFX12-NEXT: s_and_b32 s1, s1, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32 ; GFX12-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_lshr_b32 s1, s0, 16 ; GFX12-NEXT: s_and_b32 s0, s0, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] @@ -8047,76 +8061,91 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: s_lshr_b32 s15, s14, 16 ; GFX12-NEXT: s_and_b32 s14, s14, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:240 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
v_mov_b32_e32 v0, s14 ; GFX12-NEXT: v_mov_b32_e32 v2, s15 ; GFX12-NEXT: s_lshr_b32 s14, s13, 16 ; GFX12-NEXT: s_and_b32 s13, s13, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:224 ; GFX12-NEXT: v_mov_b32_e32 v0, s13 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s14 ; GFX12-NEXT: s_lshr_b32 s13, s12, 16 ; GFX12-NEXT: s_and_b32 s12, s12, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:208 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s12 ; GFX12-NEXT: v_mov_b32_e32 v2, s13 ; GFX12-NEXT: s_lshr_b32 s12, s11, 16 ; GFX12-NEXT: s_and_b32 s11, s11, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:192 ; GFX12-NEXT: v_mov_b32_e32 v0, s11 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s12 ; GFX12-NEXT: s_lshr_b32 s11, s10, 16 ; GFX12-NEXT: s_and_b32 s10, s10, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:176 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s10 ; GFX12-NEXT: v_mov_b32_e32 v2, s11 ; GFX12-NEXT: s_lshr_b32 s10, s9, 16 ; GFX12-NEXT: s_and_b32 s9, s9, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:160 ; GFX12-NEXT: v_mov_b32_e32 v0, s9 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s9, s8, 16 ; GFX12-NEXT: s_and_b32 s8, s8, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:144 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s8 ; GFX12-NEXT: v_mov_b32_e32 v2, s9 ; GFX12-NEXT: s_lshr_b32 s8, s7, 16 ; GFX12-NEXT: s_and_b32 s7, s7, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:128 ; GFX12-NEXT: v_mov_b32_e32 v0, s7 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s8 ; GFX12-NEXT: s_lshr_b32 s7, s6, 16 ; GFX12-NEXT: s_and_b32 s6, s6, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: v_mov_b32_e32 v2, s7 ; GFX12-NEXT: s_lshr_b32 s6, s5, 16 ; GFX12-NEXT: s_and_b32 s5, s5, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96 ; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: s_lshr_b32 s5, s4, 16 ; GFX12-NEXT: s_and_b32 s4, s4, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshr_b32 s4, s3, 16 ; GFX12-NEXT: s_and_b32 s3, s3, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 ; GFX12-NEXT: s_and_b32 s2, s2, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_lshr_b32 s2, s1, 16 ; GFX12-NEXT: s_and_b32 s1, s1, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32 ; GFX12-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_lshr_b32 s1, s0, 16 ; GFX12-NEXT: s_and_b32 s0, s0, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] @@ -8926,6 +8955,7 @@ define 
amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s61 ; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s13 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s59 ; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s57 ; GFX12-NEXT: v_dual_mov_b32 v12, s56 :: v_dual_mov_b32 v15, s55 @@ -8937,6 +8967,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:208 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:192 ; GFX12-NEXT: v_dual_mov_b32 v1, s53 :: v_dual_mov_b32 v0, s52 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12 ; GFX12-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v4, s44 ; GFX12-NEXT: v_dual_mov_b32 v7, s51 :: v_dual_mov_b32 v6, s50 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index f1a6bccc559f0..4ab55164e0999 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -4390,6 +4390,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v1, s24 :: v_dual_mov_b32 v0, s22 ; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s23 ; GFX12-NEXT: v_mov_b32_e32 v5, s56 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 67a376b8c0f3c..7f26ad7009e44 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -1119,8 +1119,8 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_and_b32 s3, s2, 0xff ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1223,8 +1223,8 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_sext_i32_i8 s3, s2 ; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] @@ -1332,6 +1332,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] @@ -1439,6 +1440,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX12-NEXT: s_ashr_i32 s3, s2, 24 ; 
GFX12-NEXT: s_sext_i32_i8 s4, s2 ; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -1597,6 +1599,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1 ; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5 ; GFX12-NEXT: v_mov_b32_e32 v6, s3 @@ -1761,6 +1764,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: s_sext_i32_i8 s3, s3 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6 ; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX12-NEXT: v_mov_b32_e32 v4, s3 @@ -2018,11 +2022,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s3, s5, 24 ; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3 ; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1 ; GFX12-NEXT: s_lshr_b32 s2, s4, 24 ; GFX12-NEXT: s_and_b32 s10, s4, 0xff ; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2 ; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5 ; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9 @@ -2294,6 +2300,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s9, s5, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX12-NEXT: s_ashr_i32 s2, s4, 24 @@ -2305,6 +2312,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: v_mov_b32_e32 v8, s5 ; GFX12-NEXT: v_mov_b32_e32 v10, s9 ; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v12, s4 ; GFX12-NEXT: v_mov_b32_e32 v14, s3 ; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8 @@ -2753,7 +2761,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_and_b32 s21, s7, 0xff ; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13 ; GFX12-NEXT: v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 @@ -2767,6 +2775,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14 ; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20 ; GFX12-NEXT: s_lshr_b32 s3, s5, 
24 ; GFX12-NEXT: s_and_b32 s19, s5, 0xff @@ -2776,6 +2785,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_and_b32 s18, s4, 0xff ; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112 @@ -3263,6 +3273,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_sext_i32_i8 s7, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22 ; GFX12-NEXT: v_mov_b32_e32 v2, s11 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX12-NEXT: v_bfe_i32 v25, v11, 0, 8 @@ -3276,6 +3287,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18 ; GFX12-NEXT: v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16 ; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2 ; GFX12-NEXT: v_mov_b32_e32 v30, s19 ; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80010 @@ -3288,6 +3300,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: v_mov_b32_e32 v20, s6 ; GFX12-NEXT: v_mov_b32_e32 v22, s15 ; GFX12-NEXT: v_bfe_i32 v17, v14, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v16, s5 ; GFX12-NEXT: v_mov_b32_e32 v18, s13 ; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8 @@ -4116,11 +4129,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0 ; GFX12-NEXT: v_dual_mov_b32 v60, 0 :: v_dual_and_b32 v5, 0xffff, v5 ; GFX12-NEXT: v_dual_mov_b32 v56, s50 :: v_dual_and_b32 v9, 0xffff, v9 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v58, s15 ; GFX12-NEXT: s_and_b32 s43, s8, 0xff ; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010 ; GFX12-NEXT: s_and_b32 s48, s13, 0xff ; GFX12-NEXT: s_bfe_u32 s13, s13, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v30, s43 :: v_dual_and_b32 v57, 0xffff, v0 ; GFX12-NEXT: v_dual_mov_b32 v59, s34 :: v_dual_mov_b32 v32, s8 ; GFX12-NEXT: s_lshr_b32 s27, s9, 24 @@ -4132,6 +4147,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010 ; GFX12-NEXT: s_and_b32 s47, s12, 0xff ; GFX12-NEXT: s_bfe_u32 s12, s12, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v36, s9 :: v_dual_and_b32 v53, 0xffff, v2 ; GFX12-NEXT: v_dual_mov_b32 v55, s33 :: v_dual_mov_b32 v26, s42 ; GFX12-NEXT: s_lshr_b32 s25, s7, 24 @@ -4139,6 +4155,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_and_b32 v23, 0xffff, v12 ; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v34, s44 :: v_dual_and_b32 v49, 0xffff, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v51, s31 :: v_dual_mov_b32 v28, s7 ; GFX12-NEXT: s_lshr_b32 s28, s10, 24 ; GFX12-NEXT: s_lshr_b32 s29, s11, 24 @@ -4148,6 +4165,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_and_b32 s45, s10, 0xff ; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010 ; GFX12-NEXT: s_and_b32 s46, s11, 0xff +; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: v_dual_mov_b32 v40, s10 :: v_dual_and_b32 v45, 0xffff, v4 ; GFX12-NEXT: v_dual_mov_b32 v47, s30 :: v_dual_mov_b32 v22, s41 ; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010 @@ -4162,10 +4180,11 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v60, v[52:55], s[16:17] offset:224 ; GFX12-NEXT: global_store_b128 v60, v[48:51], s[16:17] offset:208 ; GFX12-NEXT: global_store_b128 v60, v[44:47], s[16:17] offset:192 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v44, s11 :: v_dual_mov_b32 v45, s29 ; GFX12-NEXT: v_mov_b32_e32 v24, s6 ; GFX12-NEXT: s_and_b32 s40, s5, 0xff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v41, s28 :: v_dual_mov_b32 v20, s40 ; GFX12-NEXT: s_lshr_b32 s23, s5, 24 ; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010 @@ -4175,6 +4194,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_and_b32 s39, s4, 0xff ; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v33, s26 :: v_dual_mov_b32 v16, s39 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v29, s25 :: v_dual_mov_b32 v18, s4 ; GFX12-NEXT: s_lshr_b32 s21, s3, 24 ; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010 @@ -4187,10 +4207,12 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v60, v[26:29], s[16:17] offset:112 ; GFX12-NEXT: global_store_b128 v60, v[22:25], s[16:17] offset:96 ; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v23, s23 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v14, s3 ; GFX12-NEXT: s_lshr_b32 s20, s2, 24 ; GFX12-NEXT: s_and_b32 s37, s2, 0xff ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v8, s37 ; GFX12-NEXT: s_lshr_b32 s19, s1, 24 ; GFX12-NEXT: s_and_b32 s36, s1, 0xff @@ -4199,6 +4221,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s18, s0, 24 ; GFX12-NEXT: s_and_b32 s35, s0, 0xff ; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v4, s36 ; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s19 ; GFX12-NEXT: v_dual_mov_b32 v0, s35 :: v_dual_mov_b32 v3, s18 @@ -5061,6 +5084,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_ashr_i32 s47, s14, 24 ; GFX12-NEXT: s_bfe_i32 s48, s14, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s14, s14 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v59, 0 :: v_dual_mov_b32 v52, s15 ; GFX12-NEXT: v_lshrrev_b16 v6, 8, s11 ; GFX12-NEXT: s_ashr_i32 s45, s13, 24 @@ -5080,6 +5104,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s42, s11, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s11, s11 ; GFX12-NEXT: v_bfe_i32 v45, v3, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s43 ; GFX12-NEXT: v_mov_b32_e32 v46, s46 ; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 @@ -5104,12 +5129,14 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v59, v[40:43], s[16:17] offset:192 ; GFX12-NEXT: v_mov_b32_e32 v41, s39 ; GFX12-NEXT: v_dual_mov_b32 v55, s11 :: v_dual_mov_b32 v58, s41 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v37, s37 ; GFX12-NEXT: s_ashr_i32 
s33, s7, 24 ; GFX12-NEXT: s_ashr_i32 s35, s8, 24 ; GFX12-NEXT: s_bfe_i32 s36, s8, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s8, s8 ; GFX12-NEXT: v_bfe_i32 v39, v7, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v33, s35 ; GFX12-NEXT: v_dual_mov_b32 v40, s40 :: v_dual_mov_b32 v29, s33 ; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3 @@ -5131,6 +5158,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s29, s5, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s5, s5 ; GFX12-NEXT: v_bfe_i32 v31, v10, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v19, s26 ; GFX12-NEXT: v_mov_b32_e32 v32, s36 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1 @@ -5158,6 +5186,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s28 ; GFX12-NEXT: s_bfe_i32 s23, s2, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s2, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v16, s4 ; GFX12-NEXT: v_mov_b32_e32 v18, s27 ; GFX12-NEXT: s_bfe_i32 s21, s1, 0x80010 @@ -5171,6 +5200,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_mov_b32_e32 v8, s2 ; GFX12-NEXT: v_mov_b32_e32 v10, s23 ; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s21 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 @@ -5869,7 +5899,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX12-NEXT: s_lshr_b32 s4, s2, 24 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -5877,6 +5907,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_and_b32 s2, s2, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] @@ -6027,6 +6058,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v6, s6 @@ -6225,26 +6257,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: s_lshr_b32 s5, s3, 24 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_lshr_b32 s4, s2, 24 ; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s2, s2, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX12-NEXT: s_and_b32 s2, s3, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 @@ -6490,7 +6525,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: v_bfe_i32 v14, v7, 0, 8 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 ; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9 @@ -6831,47 +6866,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_lshr_b32 s3, s7, 24 ; GFX12-NEXT: s_lshr_b32 s2, s5, 24 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_lshr_b32 s2, s6, 24 ; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_lshr_b32 s2, s4, 24 ; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_and_b32 s2, s6, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5 ; GFX12-NEXT: s_and_b32 s2, s7, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 ; GFX12-NEXT: s_and_b32 s2, s5, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX12-NEXT: s_and_b32 s2, s4, 0xff ; 
GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] @@ -7303,6 +7346,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5 @@ -7939,48 +7983,56 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s10, s7, 0x80010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10 ; GFX12-NEXT: s_lshr_b32 s11, s7, 24 ; GFX12-NEXT: s_lshr_b32 s10, s5, 24 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s11, s5, 0x80010 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s7 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 ; GFX12-NEXT: s_and_b32 s7, s7, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:240 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s3, 24 ; GFX12-NEXT: s_bfe_u32 s11, s3, 0x80010 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:176 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s1, 24 ; GFX12-NEXT: s_bfe_u32 s11, s1, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s6, 24 ; GFX12-NEXT: s_bfe_u32 s11, s6, 0x80010 ; GFX12-NEXT: s_and_b32 s6, s6, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s4, 24 ; GFX12-NEXT: s_bfe_u32 s11, s4, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:208 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s2, 24 ; GFX12-NEXT: s_bfe_u32 s11, s2, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:144 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s0, 24 ; GFX12-NEXT: s_bfe_u32 s11, s0, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16 @@ -7996,6 +8048,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 ; GFX12-NEXT: s_and_b32 s4, s4, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:192 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; 
GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 @@ -8008,6 +8061,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2 ; GFX12-NEXT: s_and_b32 s2, s2, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:128 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 @@ -8020,6 +8074,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s0 ; GFX12-NEXT: s_and_b32 s0, s0, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 @@ -8866,6 +8921,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v60, s42 :: v_dual_mov_b32 v29, s47 ; GFX12-NEXT: v_dual_mov_b32 v28, s46 :: v_dual_mov_b32 v63, s45 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v21, s5 ; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v17, s15 ; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v9, s13 @@ -9605,14 +9661,16 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 ; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3 ; GFX12-NEXT: v_lshrrev_b16 v2, 8, s2 ; GFX12-NEXT: s_lshr_b32 s2, s2, 24 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v1 ; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -9758,11 +9816,13 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2 ; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s4 ; GFX12-NEXT: s_ashr_i32 s2, s2, 24 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -9961,6 +10021,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX12-NEXT: s_lshr_b32 s2, s3, 24 ; GFX12-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX12-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v3, s2, 16, v3 ; GFX12-NEXT: v_lshl_or_b32 v1, s4, 16, v5 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] @@ -10187,8 +10248,10 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX12-NEXT: s_ashr_i32 s2, s2, 24 ; GFX12-NEXT: s_bfe_i32 s3, s6, 0x80000 ; GFX12-NEXT: s_bfe_i32 s5, s7, 0x80000 
+; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v5 @@ -10537,6 +10600,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v5 ; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v6 ; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v7, s8, 16, v11 ; GFX12-NEXT: v_lshl_or_b32 v5, s2, 16, v12 ; GFX12-NEXT: v_lshl_or_b32 v3, s12, 16, v9 @@ -10926,6 +10990,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-NEXT: v_ashrrev_i16 v0, 8, s5 ; GFX12-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX12-NEXT: s_bfe_i32 s12, s7, 0x80000 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 ; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s6 ; GFX12-NEXT: s_bfe_i32 s6, s8, 0x80000 @@ -10937,14 +11002,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s5 ; GFX12-NEXT: v_and_b32_e64 v11, 0xffff, s12 ; GFX12-NEXT: v_ashrrev_i16 v13, 8, s8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v16, 0xffff, s6 ; GFX12-NEXT: v_ashrrev_i16 v9, 8, s11 ; GFX12-NEXT: v_ashrrev_i16 v10, 8, s10 ; GFX12-NEXT: s_bfe_i32 s5, s9, 0x80000 ; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s3 ; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_pack_ll_b32_b16 s2, s5, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2 ; GFX12-NEXT: v_lshl_or_b32 v6, v0, 16, v4 ; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7 @@ -11566,6 +11633,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v6 ; GFX12-NEXT: v_lshl_or_b32 v6, v4, 16, v10 ; GFX12-NEXT: v_lshl_or_b32 v4, v3, 16, v11 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v3, s24, 16, v14 ; GFX12-NEXT: v_lshl_or_b32 v10, v9, 16, v12 ; GFX12-NEXT: v_lshl_or_b32 v8, v8, 16, v13 @@ -11580,6 +11648,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s16, s5, 24 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v11, s16, 16, v17 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12 @@ -11593,11 +11662,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s18, s2, 24 ; GFX12-NEXT: v_lshl_or_b32 v14, v5, 16, v9 ; GFX12-NEXT: v_lshl_or_b32 v12, v1, 16, v12 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v15, s12, 16, v13 ; GFX12-NEXT: v_lshl_or_b32 v13, s10, 16, v17 ; GFX12-NEXT: s_lshr_b32 s22, s0, 24 ; GFX12-NEXT: v_lshl_or_b32 v9, s14, 16, v19 ; GFX12-NEXT: v_lshl_or_b32 v5, s18, 16, v18 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v1, s22, 16, v20 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] offset:48 @@ -12316,6 +12387,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_ashrrev_i16 v1, 8, s0 ; GFX12-NEXT: s_bfe_i32 s19, s0, 0x80000 ; GFX12-NEXT: 
v_ashrrev_i16 v5, 8, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_ashr_i64 s[0:1], s[4:5], 56 ; GFX12-NEXT: v_and_b32_e64 v10, 0xffff, s2 ; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s20 @@ -12323,6 +12395,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_ashrrev_i16 v3, 8, s3 ; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX12-NEXT: s_bfe_i32 s2, s15, 0x80000 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s1, s12, 0x80000 ; GFX12-NEXT: v_and_b32_e64 v2, 0xffff, s18 @@ -12333,9 +12406,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s2 ; GFX12-NEXT: v_lshl_or_b32 v4, v4, 16, v10 ; GFX12-NEXT: v_lshl_or_b32 v10, v5, 16, v12 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s1, s7, 0x80000 ; GFX12-NEXT: s_lshr_b32 s11, s7, 16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s1, s6, 0x80000 ; GFX12-NEXT: s_lshr_b32 s10, s6, 16 @@ -12344,9 +12419,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v8 ; GFX12-NEXT: v_lshl_or_b32 v8, v7, 16, v13 ; GFX12-NEXT: v_lshl_or_b32 v7, v11, 16, v15 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s1, s11, 0x80000 ; GFX12-NEXT: s_lshr_b32 s13, s5, 16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v22, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s1, s10, 0x80000 ; GFX12-NEXT: v_ashrrev_i16 v9, 8, s17 @@ -12355,6 +12432,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_ashrrev_i16 v13, 8, s6 ; GFX12-NEXT: v_ashrrev_i16 v21, 8, s11 ; GFX12-NEXT: v_ashrrev_i16 v23, 8, s10 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v24, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s5, s16, 0x80000 ; GFX12-NEXT: v_ashrrev_i16 v1, 8, s12 @@ -12362,11 +12440,13 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s4, s13, 0x80000 ; GFX12-NEXT: v_and_b32_e64 v20, 0xffff, s3 ; GFX12-NEXT: v_ashrrev_i16 v17, 8, s16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v19, 0xffff, s5 ; GFX12-NEXT: s_pack_ll_b32_b16 s0, s4, s0 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: v_lshl_or_b32 v3, v9, 16, v14 ; GFX12-NEXT: v_lshl_or_b32 v14, v11, 16, v12 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v11, s0 ; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v15 ; GFX12-NEXT: v_lshl_or_b32 v15, v21, 16, v22 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 0894d3251423d..295ae94902da7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -503,12 +503,14 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f64: @@ -694,12 +696,14 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f64__offset: @@ -884,12 +888,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f64: @@ -1066,12 +1072,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f64__offset: @@ -1267,13 +1275,15 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16: @@ -1573,13 +1583,15 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16__offset: @@ 
-1887,12 +1899,14 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16: @@ -2181,12 +2195,14 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2474,13 +2490,15 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16__offset__align4: @@ -2710,12 +2728,14 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2958,13 +2978,15 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16: @@ -3312,13 +3334,15 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: 
s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -3674,12 +3698,14 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16: @@ -4016,12 +4042,14 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4357,13 +4385,15 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset__align4: @@ -4648,12 +4678,14 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -7018,11 +7050,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB28_2 ; GFX12-NEXT: ; %bb.1: +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 ; GFX12-NEXT: s_lshl_b32 s5, s1, 3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7030,25 +7063,27 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX12-NEXT: s_cbranch_execz .LBB28_4 ; GFX12-NEXT: ; %bb.3: ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX12-NEXT: s_lshl_b32 s0, s1, 4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_4: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo @@ -7061,22 +7096,26 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB28_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_ctz_i32_b32 s5, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s5 ; GFX12-NEXT: s_lshl_b32 s7, 1, s5 ; GFX12-NEXT: v_writelane_b32 v0, s0, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execz .LBB28_8 ; GFX12-NEXT: ; %bb.7: @@ -7086,6 +7125,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_8: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, 
s1 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 @@ -7885,32 +7925,36 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB29_2 ; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 ; GFX12-NEXT: s_lshl_b32 s5, s1, 3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: .LBB29_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX12-NEXT: s_mov_b32 s6, exec_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX12-NEXT: s_cbranch_execz .LBB29_4 ; GFX12-NEXT: ; %bb.3: ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX12-NEXT: s_lshl_b32 s0, s1, 4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: .LBB29_4: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo @@ -7923,28 +7967,33 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB29_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_ctz_i32_b32 s5, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s5 ; GFX12-NEXT: s_lshl_b32 s7, 1, s5 ; GFX12-NEXT: v_writelane_b32 v0, s0, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execz .LBB29_8 ; GFX12-NEXT: ; %bb.7: ; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: .LBB29_8: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_load_b64 
s[0:1], s[2:3], 0x0 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 6dec36c316ee3..cc79db1b20af4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -816,13 +816,15 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16: @@ -1129,13 +1131,15 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1450,12 +1454,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16: @@ -1752,12 +1758,14 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16__offset: @@ -2053,13 +2061,15 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16__offset__align4: @@ -2297,12 +2307,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2552,13 +2564,15 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16: @@ -2908,13 +2922,15 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3272,12 +3288,14 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16: @@ -3616,12 +3634,14 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3959,13 +3979,15 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset__align4: @@ -4252,12 +4274,14 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4531,13 +4555,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2f16: @@ -4802,13 +4828,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2f16__offset: @@ -5073,12 +5101,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
local_atomic_fmax_noret_v2f16: @@ -5334,12 +5364,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5618,13 +5650,15 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2bf16: @@ -5994,13 +6028,15 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2bf16__offset: @@ -6370,12 +6406,14 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16: @@ -6733,12 +6771,14 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll 
b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index b3132a2fa80dd..1ffd93e35d8cd 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -816,13 +816,15 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16: @@ -1129,13 +1131,15 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1450,12 +1454,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16: @@ -1752,12 +1758,14 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16__offset: @@ -2053,13 +2061,15 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16__offset__align4: @@ -2297,12 +2307,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2552,13 +2564,15 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16: @@ -2908,13 +2922,15 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3272,12 +3288,14 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16: @@ -3616,12 +3634,14 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset: 
@@ -3959,13 +3979,15 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset__align4: @@ -4252,12 +4274,14 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4531,13 +4555,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2f16: @@ -4802,13 +4828,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2f16__offset: @@ -5073,12 +5101,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2f16: @@ -5334,12 +5364,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr 
addrspace(3) %ptr, <2 x h ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5618,13 +5650,15 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2bf16: @@ -5994,13 +6028,15 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2bf16__offset: @@ -6370,12 +6406,14 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16: @@ -6733,12 +6771,14 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 5ebeddd04b2ae..9bc8bafc34a68 100644 --- 
a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -35,13 +35,15 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32: @@ -246,13 +248,15 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32__offset: @@ -457,12 +461,14 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32: @@ -657,12 +663,14 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32__offset: @@ -865,12 +873,14 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f64: @@ -1081,12 +1091,14 @@ define double 
@local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1296,12 +1308,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f64: @@ -1501,12 +1515,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1725,13 +1741,15 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16: @@ -2031,13 +2049,15 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2345,12 +2365,14 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; 
GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16: @@ -2639,12 +2661,14 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2932,13 +2956,15 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16__offset__align4: @@ -3168,12 +3194,14 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3416,13 +3444,15 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16: @@ -3770,13 +3800,15 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4132,12 +4164,14 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16: @@ -4474,12 +4508,14 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4815,13 +4851,15 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset__align4: @@ -5106,12 +5144,14 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5381,13 +5421,15 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2f16: @@ -5635,13 +5677,15 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2f16__offset: @@ -5888,12 +5932,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2f16: @@ -6130,12 +6176,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6398,13 +6446,15 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2bf16: @@ -6774,13 +6824,15 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2bf16__offset: @@ -7150,12 +7202,14 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7513,12 +7567,14 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -7864,13 +7920,15 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: @@ -8074,12 +8132,14 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 390d1d70ff2aa..df954f6f940c8 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GCN-NEXT: .LBB0_2: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 ; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 @@ -149,6 +149,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: .LBB3_1: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: s_add_co_i32 s2, s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index fef1b57db5685..acba2841a7107 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -200,6 +200,7 @@ define amdgpu_kernel void @caller() { ; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX12-SDAG-NEXT: s_endpgm ; @@ -212,6 +213,7 @@ define amdgpu_kernel void @caller() { ; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() @@ -276,9 +278,10 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 ; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v8, s1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll index f90753652baa5..1da05ed264a64 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll @@ -107,6 +107,7 @@ define amdgpu_cs void @caller() { ; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi ; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; @@ -116,6 +117,7 @@ define amdgpu_cs void @caller() { ; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo ; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() @@ -182,9 +184,10 @@ define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace( ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 ; GFX12-NEXT: 
s_lshr_b32 s1, ttmp7, 16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v8, s1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index 94d1eca05ed0e..8a789a4c6cda9 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -106,17 +106,20 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_ci_u32 s0, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bitcmp1_b32 s0, 0 ; GFX12-NEXT: s_bitset0_b32 s0, 0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s59, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc @@ -124,8 +127,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX12-NEXT: v_readlane_b32 s59, v1, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: @@ -313,10 +318,12 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s59, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 @@ -327,8 +334,10 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX12-NEXT: v_readlane_b32 s59, v1, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc: @@ -530,17 +539,20 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: s_mov_b32 s33, s32 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v1, s33 offset:16388 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_ci_u32 s0, s33, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s33 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bitcmp1_b32 s0, 0 ; GFX12-NEXT: s_bitset0_b32 s0, 0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: 
;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s59, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc @@ -548,10 +560,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: v_readlane_b32 s59, v1, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -745,6 +759,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_mov_b32 s59, s32 @@ -756,8 +771,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: @@ -911,6 +928,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_mov_b32 s59, s32 @@ -921,8 +939,10 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: @@ -1092,6 +1112,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX12-NEXT: s_mov_b32 s33, s32 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 @@ -1103,10 +1124,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1292,6 +1315,7 @@ define void 
@scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX12-NEXT: s_mov_b32 s33, s32 ; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_mov_b32 s59, s33 @@ -1303,10 +1327,12 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1492,9 +1518,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 ; GFX12-NEXT: v_writelane_b32 v2, s59, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART @@ -1509,8 +1537,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: v_readlane_b32 s59, v2, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: @@ -1710,10 +1740,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000 ; GFX12-NEXT: v_writelane_b32 v2, s59, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_add_nc_u32_e64 v1, s0, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo @@ -1728,8 +1760,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: v_readlane_b32 s59, v2, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index 6346406fa8941..9829b7e787d47 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -673,6 +673,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v23, s30, 0 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 @@ -711,13 +712,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0x4000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bitcmp1_b32 s32, 0 ; GFX12-NEXT: v_writelane_b32 v23, s59, 28 ; GFX12-NEXT: s_bitset0_b32 s32, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s59, s32 ; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0xffffc000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bitcmp1_b32 s32, 0 ; GFX12-NEXT: s_bitset0_b32 s32, 0 ; GFX12-NEXT: ;;#ASMSTART @@ -754,8 +756,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: v_readlane_b32 s30, v23, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) @@ -1396,6 +1400,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v21, s30, 0 ; GFX12-NEXT: v_writelane_b32 v21, s31, 1 @@ -1466,8 +1471,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: v_readlane_b32 s30, v21, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 16, addrspace(5) @@ -2196,16 +2203,18 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:32768 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v23, s30, 0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: v_writelane_b32 v23, s31, 1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v1 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x200, v0 ; GFX12-NEXT: v_writelane_b32 v23, s33, 2 ; GFX12-NEXT: v_writelane_b32 v23, s34, 3 @@ -2271,8 +2280,10 @@ define void 
@scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: v_readlane_b32 s30, v23, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:32768 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca [4096 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 45e8b3bcff13c..e9b4ec52599a0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX12-WGP-LABEL: flat_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX12-CU-LABEL: flat_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-WGP-LABEL: flat_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-CU-LABEL: flat_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -547,6 +551,7 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-WGP-LABEL: flat_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -565,6 +570,7 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-CU-LABEL: flat_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -765,6 +771,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-WGP-LABEL: flat_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -787,6 +794,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-LABEL: flat_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; 
GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1550,6 +1558,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1562,6 +1571,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1729,6 +1739,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1743,6 +1754,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1901,6 +1913,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1918,6 +1931,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-CU-LABEL: flat_agent_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2106,6 +2120,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2125,6 +2140,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2315,6 +2331,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2334,6 +2351,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; 
GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3215,6 +3233,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3231,6 +3250,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3473,6 +3493,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3491,6 +3512,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3724,6 +3746,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3745,6 +3768,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4008,6 +4032,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4031,6 +4056,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4296,6 +4322,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; 
GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4319,6 +4346,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4568,6 +4596,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4586,6 +4615,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4830,6 +4860,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4848,6 +4879,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5108,6 +5140,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5131,6 +5164,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5396,6 +5430,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5419,6 +5454,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5684,6 +5720,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5707,6 +5744,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5972,6 +6010,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5995,6 +6034,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6260,6 +6300,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6283,6 +6324,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6548,6 +6590,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6571,6 +6614,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6836,6 +6880,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6859,6 +6904,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7124,6 +7170,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7147,6 +7194,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7401,6 +7449,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7421,6 +7470,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7687,6 +7737,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7710,6 +7761,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7982,6 +8034,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8007,6 +8060,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8294,6 +8348,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8322,6 +8377,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8612,6 +8668,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8640,6 +8697,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8914,6 +8972,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8937,6 +8996,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9206,6 +9266,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9229,6 +9290,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9514,6 +9576,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9542,6 +9605,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9832,6 +9896,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9860,6 +9925,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10150,6 +10216,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10178,6 +10245,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10468,6 +10536,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10496,6 +10565,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10786,6 +10856,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10814,6 +10885,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; 
GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11104,6 +11176,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11132,6 +11205,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11422,6 +11496,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11450,6 +11525,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11740,6 +11816,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11768,6 +11845,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11953,6 +12031,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX12-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11968,6 +12047,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX12-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12138,6 +12218,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 
s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12153,6 +12234,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12344,6 +12426,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12363,6 +12446,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12572,6 +12656,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12595,6 +12680,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13359,6 +13445,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13371,6 +13458,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13534,6 +13622,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13548,6 +13637,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13706,6 +13796,7 @@ define amdgpu_kernel 
void @flat_agent_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13723,6 +13814,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13907,6 +13999,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13926,6 +14019,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14112,6 +14206,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14131,6 +14226,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15042,6 +15138,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15058,6 +15155,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15296,6 +15394,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15314,6 +15413,7 @@ define amdgpu_kernel void 
@flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15547,6 +15647,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15568,6 +15669,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15827,6 +15929,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15850,6 +15953,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16111,6 +16215,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16134,6 +16239,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16379,6 +16485,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16397,6 +16504,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16637,6 +16745,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16655,6 +16764,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16911,6 +17021,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16934,6 +17045,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17195,6 +17307,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17218,6 +17331,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17479,6 +17593,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17502,6 +17617,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17763,6 +17879,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17786,6 +17903,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18047,6 +18165,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18070,6 +18189,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18331,6 +18451,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18354,6 +18475,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18615,6 +18737,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18638,6 +18761,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18899,6 +19023,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18922,6 +19047,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19176,6 +19302,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19196,6 +19323,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19470,6 +19598,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19494,6 +19623,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19767,6 +19897,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19792,6 +19923,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20087,6 +20219,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20116,6 +20249,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20415,6 
+20549,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20444,6 +20579,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20727,6 +20863,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20751,6 +20888,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21029,6 +21167,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21053,6 +21192,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21347,6 +21487,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21376,6 +21517,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21675,6 +21817,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21704,6 +21847,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22003,6 +22147,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22032,6 +22177,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22331,6 +22477,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22360,6 +22507,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22659,6 +22807,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22688,6 +22837,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22987,6 +23137,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23016,6 +23167,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; 
GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23315,6 +23467,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23344,6 +23497,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23643,6 +23797,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23672,6 +23827,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index fb40274cac1ba..5c59481c59853 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -6,6 +6,7 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_0: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -29,18 +30,23 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-NEXT: s_mov_b32 s2, 2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: ; implicit-def: $sgpr2 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s3, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_mov_b32 s2, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: v_add_co_u32 v0, s3, s3, v0 ; GFX12-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 
@@ -64,6 +70,7 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -88,6 +95,7 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) ; GFX12-LABEL: flat_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 5fa8e6891bafb..b2340caa2933f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-WGP-LABEL: flat_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-LABEL: flat_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -475,18 +477,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, s4 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 @@ -504,18 +511,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr2 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, s4 +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-CU-NEXT: s_mov_b32 s2, s5 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 @@ -688,6 +700,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-WGP-LABEL: flat_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -703,6 +716,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-LABEL: flat_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1007,17 +1021,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-WGP-NEXT: s_mov_b32 s1, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 @@ -1036,17 +1055,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX12-CU-NEXT: s_mov_b32 s0, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-CU-NEXT: s_mov_b32 s0, 0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-CU-NEXT: s_mov_b32 s1, s2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, s3 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 @@ -1224,6 +1248,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX12-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1242,6 +1267,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX12-CU-LABEL: flat_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe 
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 4c9ce15211e34..304c80d7bb24d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX12-WGP-LABEL: flat_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX12-CU-LABEL: flat_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX12-CU-LABEL: flat_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -534,6 +538,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX12-WGP-LABEL: flat_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -549,6 +554,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX12-CU-LABEL: flat_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -719,6 +725,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -734,6 +741,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1438,6 +1446,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX12-WGP-LABEL: 
flat_singlethread_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1450,6 +1459,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1590,6 +1600,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1602,6 +1613,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1742,6 +1754,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1754,6 +1767,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1894,6 +1908,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1906,6 +1921,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2046,6 +2062,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2058,6 +2075,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 
0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2823,6 +2841,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2839,6 +2858,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3054,6 +3074,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3070,6 +3091,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3285,6 +3307,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3301,6 +3324,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3516,6 +3540,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3532,6 +3557,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3747,6 +3773,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; 
%bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3763,6 +3790,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3978,6 +4006,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3994,6 +4023,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4209,6 +4239,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4225,6 +4256,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4440,6 +4472,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4456,6 +4489,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4671,6 +4705,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4687,6 +4722,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; 
GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4902,6 +4938,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4918,6 +4955,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5133,6 +5171,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5149,6 +5188,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5364,6 +5404,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5380,6 +5421,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5595,6 +5637,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5611,6 +5654,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5826,6 +5870,7 @@ define 
amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5842,6 +5887,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6057,6 +6103,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6073,6 +6120,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6320,6 +6368,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6340,6 +6389,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6593,6 +6643,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6613,6 +6664,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6866,6 +6918,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6886,6 +6939,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7139,6 +7193,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7159,6 +7214,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7412,6 +7468,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7432,6 +7489,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7685,6 +7743,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7705,6 +7764,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7958,6 +8018,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7978,6 +8039,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; 
GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8231,6 +8293,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8251,6 +8314,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8504,6 +8568,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8524,6 +8589,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8777,6 +8843,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8797,6 +8864,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9050,6 +9118,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9070,6 +9139,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9323,6 +9393,7 @@ define 
amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9343,6 +9414,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9596,6 +9668,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9616,6 +9689,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9869,6 +9943,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9889,6 +9964,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10142,6 +10218,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10162,6 +10239,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10339,6 +10417,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10354,6 +10433,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10524,6 +10604,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10539,6 +10620,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10709,6 +10791,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10724,6 +10807,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10894,6 +10978,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10909,6 +10994,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11613,6 +11699,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11625,6 +11712,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11765,6 +11853,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11777,6 +11866,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11917,6 +12007,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11929,6 +12020,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12069,6 +12161,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12081,6 +12174,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12221,6 +12315,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12233,6 +12328,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12998,6 +13094,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13014,6 +13111,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13229,6 +13327,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13245,6 +13344,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13460,6 +13560,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13476,6 +13577,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13691,6 +13793,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13707,6 +13810,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13922,6 +14026,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13938,6 +14043,7 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14153,6 +14259,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14169,6 +14276,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14384,6 +14492,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14400,6 +14509,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14615,6 +14725,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14631,6 +14742,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14846,6 +14958,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14862,6 +14975,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15077,6 +15191,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15093,6 +15208,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15308,6 +15424,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15324,6 +15441,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15539,6 +15657,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15555,6 +15674,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15770,6 +15890,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15786,6 +15907,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16001,6 +16123,7 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16017,6 +16140,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16232,6 +16356,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16248,6 +16373,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16495,6 +16621,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16515,6 +16642,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16768,6 +16896,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16788,6 +16917,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17041,6 +17171,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17061,6 +17192,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17314,6 +17446,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17334,6 +17467,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17587,6 +17721,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17607,6 +17742,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17860,6 +17996,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17880,6 +18017,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18133,6 +18271,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18153,6 +18292,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18406,6 +18546,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18426,6 +18567,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18679,6 +18821,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18699,6 +18842,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18952,6 +19096,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18972,6 +19117,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19225,6 +19371,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19245,6 +19392,7 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19498,6 +19646,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19518,6 +19667,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19771,6 +19921,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19791,6 +19942,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20044,6 +20196,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20064,6 +20217,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20317,6 +20471,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20337,6 +20492,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry 
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index e77f1432c1c9d..3502a29edeecb 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX12-WGP-LABEL: flat_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX12-CU-LABEL: flat_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-WGP-LABEL: flat_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-CU-LABEL: flat_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -549,6 +553,7 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-WGP-LABEL: flat_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -567,6 +572,7 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-CU-LABEL: flat_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -769,6 +775,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-WGP-LABEL: flat_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -791,6 +798,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-LABEL: flat_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1558,6 +1566,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; 
%entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1570,6 +1579,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1739,6 +1749,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1753,6 +1764,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1913,6 +1925,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-WGP-LABEL: flat_system_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1930,6 +1943,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-CU-LABEL: flat_system_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2122,6 +2136,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2141,6 +2156,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2335,6 +2351,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2354,6 +2371,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3245,6 +3263,7 @@ define 
amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3261,6 +3280,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3505,6 +3525,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3523,6 +3544,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3758,6 +3780,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3779,6 +3802,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4046,6 +4070,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4069,6 +4094,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4338,6 +4364,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4361,6 +4388,7 @@ define 
amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4612,6 +4640,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4630,6 +4659,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4876,6 +4906,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4894,6 +4925,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5158,6 +5190,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5181,6 +5214,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5450,6 +5484,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5473,6 +5508,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5742,6 +5778,7 @@ define amdgpu_kernel void 
@flat_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5765,6 +5802,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6034,6 +6072,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6057,6 +6096,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6326,6 +6366,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6349,6 +6390,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6618,6 +6660,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6641,6 +6684,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6910,6 +6954,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6933,6 +6978,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; 
GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7202,6 +7248,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7225,6 +7272,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7479,6 +7527,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7499,6 +7548,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7767,6 +7817,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7790,6 +7841,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8064,6 +8116,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8089,6 +8142,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8380,6 +8434,7 @@ define amdgpu_kernel void 
@flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8408,6 +8463,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8702,6 +8758,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8730,6 +8787,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9006,6 +9064,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9029,6 +9088,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9300,6 +9360,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9323,6 +9384,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9612,6 +9674,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ 
-9640,6 +9703,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9934,6 +9998,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9962,6 +10027,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10256,6 +10322,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10284,6 +10351,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10578,6 +10646,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10606,6 +10675,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10900,6 +10970,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10928,6 +10999,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; 
GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11222,6 +11294,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11250,6 +11323,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11544,6 +11618,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11572,6 +11647,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11866,6 +11942,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11894,6 +11971,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -12079,6 +12157,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX12-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12094,6 +12173,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX12-CU-LABEL: flat_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12264,6 +12344,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12279,6 +12360,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12472,6 +12554,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12491,6 +12574,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-CU-LABEL: flat_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12702,6 +12786,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12725,6 +12810,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13493,6 +13579,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13505,6 +13592,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13670,6 +13758,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13684,6 +13773,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13844,6 +13934,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; 
GFX12-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13861,6 +13952,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14049,6 +14141,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14068,6 +14161,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14258,6 +14352,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14277,6 +14372,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15198,6 +15294,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15214,6 +15311,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15454,6 +15552,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15472,6 +15571,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; 
GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15707,6 +15807,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15728,6 +15829,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15991,6 +16093,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16014,6 +16117,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16279,6 +16383,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16302,6 +16407,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16549,6 +16655,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16567,6 +16674,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc @@ -16809,6 +16917,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16827,6 +16936,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17087,6 +17197,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17110,6 +17221,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17375,6 +17487,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17398,6 +17511,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17663,6 +17777,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17686,6 +17801,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17951,6 +18067,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17974,6 +18091,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18239,6 +18357,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18262,6 +18381,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18527,6 +18647,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18550,6 +18671,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18815,6 +18937,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18838,6 +18961,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19103,6 +19227,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19126,6 +19251,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19380,6 +19506,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19400,6 +19527,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19676,6 +19804,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19700,6 +19829,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19975,6 +20105,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20000,6 +20131,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20299,6 +20431,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20328,6 +20461,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ 
-20631,6 +20765,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20660,6 +20795,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20945,6 +21081,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20969,6 +21106,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21249,6 +21387,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21273,6 +21412,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21571,6 +21711,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21600,6 +21741,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21903,6 +22045,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] 
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21932,6 +22075,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22235,6 +22379,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22264,6 +22409,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22567,6 +22713,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22596,6 +22743,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22899,6 +23047,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22928,6 +23077,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23231,6 +23381,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23260,6 +23411,7 @@ define amdgpu_kernel void 
@flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23563,6 +23715,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23592,6 +23745,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23895,6 +24049,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23924,6 +24079,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 6bf54ccabc9da..c2b7aa4fcfbf1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -110,6 +110,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-WGP-LABEL: flat_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -128,6 +129,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-LABEL: flat_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -329,18 +331,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr2 ; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, s4 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 @@ -361,18 +368,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr2 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, s4 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-CU-NEXT: s_mov_b32 s2, s5 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 @@ -498,6 +510,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-WGP-LABEL: flat_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -518,6 +531,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-LABEL: flat_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -727,17 +741,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-WGP-NEXT: s_mov_b32 s1, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 @@ -761,17 +780,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX12-CU-NEXT: s_mov_b32 s0, 2 +; 
GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-CU-NEXT: s_mov_b32 s0, 0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-CU-NEXT: s_mov_b32 s1, s2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, s3 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 @@ -896,6 +920,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX12-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -914,6 +939,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX12-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index c7826181cc8dd..23982f8a00cdb 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX12-WGP-LABEL: flat_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX12-CU-LABEL: flat_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX12-CU-LABEL: flat_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -534,6 +538,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX12-WGP-LABEL: flat_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -549,6 +554,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX12-CU-LABEL: flat_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -719,6 +725,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -734,6 +741,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1438,6 +1446,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1450,6 +1459,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1590,6 +1600,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1602,6 +1613,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1742,6 +1754,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1754,6 +1767,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1894,6 +1908,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: 
s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1906,6 +1921,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2046,6 +2062,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2058,6 +2075,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2823,6 +2841,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2839,6 +2858,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3054,6 +3074,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3070,6 +3091,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3285,6 +3307,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3301,6 +3324,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3516,6 +3540,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3532,6 +3557,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3747,6 +3773,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3763,6 +3790,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3978,6 +4006,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3994,6 +4023,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4209,6 +4239,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4225,6 +4256,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4440,6 +4472,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], 
s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4456,6 +4489,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4671,6 +4705,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4687,6 +4722,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4902,6 +4938,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4918,6 +4955,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5133,6 +5171,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5149,6 +5188,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5364,6 +5404,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5380,6 +5421,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5595,6 +5637,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5611,6 +5654,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5826,6 +5870,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5842,6 +5887,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6057,6 +6103,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6073,6 +6120,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6320,6 +6368,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6340,6 +6389,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6593,6 +6643,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; 
%bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6613,6 +6664,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6866,6 +6918,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6886,6 +6939,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7139,6 +7193,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7159,6 +7214,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7412,6 +7468,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7432,6 +7489,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7685,6 +7743,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7705,6 +7764,7 @@ define amdgpu_kernel void 
@flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7958,6 +8018,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7978,6 +8039,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8231,6 +8293,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8251,6 +8314,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8504,6 +8568,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8524,6 +8589,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8777,6 +8843,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8797,6 +8864,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc @@ -9050,6 +9118,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9070,6 +9139,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9323,6 +9393,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9343,6 +9414,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9596,6 +9668,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9616,6 +9689,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9869,6 +9943,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9889,6 +9964,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10142,6 +10218,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10162,6 +10239,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10339,6 +10417,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10354,6 +10433,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10524,6 +10604,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10539,6 +10620,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10709,6 +10791,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10724,6 +10807,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10894,6 +10978,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10909,6 +10994,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11613,6 +11699,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11625,6 +11712,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11765,6 +11853,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11777,6 +11866,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11917,6 +12007,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11929,6 +12020,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12069,6 +12161,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12081,6 +12174,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12221,6 +12315,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12233,6 
+12328,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12998,6 +13094,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13014,6 +13111,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13229,6 +13327,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13245,6 +13344,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13460,6 +13560,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13476,6 +13577,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13691,6 +13793,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13707,6 +13810,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 
0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13922,6 +14026,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13938,6 +14043,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14153,6 +14259,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14169,6 +14276,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14384,6 +14492,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14400,6 +14509,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14615,6 +14725,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14631,6 +14742,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14846,6 +14958,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; 
GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14862,6 +14975,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15077,6 +15191,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15093,6 +15208,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15308,6 +15424,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15324,6 +15441,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15539,6 +15657,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15555,6 +15674,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15770,6 +15890,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15786,6 +15907,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16001,6 +16123,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16017,6 +16140,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16232,6 +16356,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16248,6 +16373,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16495,6 +16621,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16515,6 +16642,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16768,6 +16896,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16788,6 +16917,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: 
flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17041,6 +17171,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17061,6 +17192,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17314,6 +17446,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17334,6 +17467,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17587,6 +17721,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17607,6 +17742,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17860,6 +17996,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17880,6 +18017,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18133,6 +18271,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18153,6 +18292,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18406,6 +18546,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18426,6 +18567,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18679,6 +18821,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18699,6 +18842,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18952,6 +19096,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18972,6 +19117,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19225,6 +19371,7 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19245,6 +19392,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19498,6 +19646,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19518,6 +19667,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19771,6 +19921,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19791,6 +19942,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20044,6 +20196,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20064,6 +20217,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 8949e4b782f63..cd2c8176b8d33 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX12-WGP-LABEL: flat_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX12-CU-LABEL: flat_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX12-CU-LABEL: flat_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -544,6 +548,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX12-WGP-LABEL: flat_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -562,6 +567,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX12-CU-LABEL: flat_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -755,6 +761,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -777,6 +784,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1519,6 +1527,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1531,6 +1540,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1687,6 +1697,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1701,6 +1712,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1854,6 +1866,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1871,6 +1884,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2040,6 +2054,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2059,6 +2074,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2229,6 +2245,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2248,6 +2265,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3093,6 +3111,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; 
GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3109,6 +3128,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3340,6 +3360,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3358,6 +3379,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3586,6 +3608,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3607,6 +3630,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3851,6 +3875,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3874,6 +3899,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4119,6 +4145,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4142,6 +4169,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; 
GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4375,6 +4403,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4393,6 +4422,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4625,6 +4655,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4643,6 +4674,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4887,6 +4919,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4910,6 +4943,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5155,6 +5189,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5178,6 +5213,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5423,6 +5459,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5446,6 +5483,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5691,6 +5729,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5714,6 +5753,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5963,6 +6003,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5983,6 +6024,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6246,6 +6288,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6269,6 +6312,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6535,6 +6579,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6560,6 +6605,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; 
GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6836,6 +6882,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6864,6 +6911,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7141,6 +7189,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7169,6 +7218,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7434,6 +7484,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7457,6 +7508,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7721,6 +7773,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7744,6 +7797,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8020,6 +8074,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: 
flat_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8048,6 +8103,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8325,6 +8381,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8353,6 +8410,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8630,6 +8688,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8658,6 +8717,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8935,6 +8995,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8963,6 +9024,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9240,6 +9302,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9268,6 +9331,7 @@ define 
amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9545,6 +9609,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9573,6 +9638,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9850,6 +9916,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9878,6 +9945,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10155,6 +10223,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10183,6 +10252,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10362,6 +10432,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10377,6 +10448,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: 
s_wait_kmcnt 0x0 @@ -10547,6 +10619,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10562,6 +10635,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10738,6 +10812,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10757,6 +10832,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10939,6 +11015,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10962,6 +11039,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11688,6 +11766,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11700,6 +11779,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11848,6 +11928,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11862,6 +11943,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12008,6 +12090,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12025,6 +12108,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12179,6 +12263,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12198,6 +12283,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12352,6 +12438,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12371,6 +12458,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13188,6 +13276,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13204,6 +13293,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13427,6 +13517,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13445,6 +13536,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13666,6 +13758,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13687,6 +13780,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13916,6 +14010,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13939,6 +14034,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14168,6 +14264,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14191,6 +14288,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14414,6 +14512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14432,6 +14531,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14655,6 +14755,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14673,6 +14774,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14902,6 +15004,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14925,6 +15028,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15154,6 +15258,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15177,6 +15282,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15406,6 +15512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15429,6 +15536,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: 
flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15658,6 +15766,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15681,6 +15790,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15910,6 +16020,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15933,6 +16044,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16162,6 +16274,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16185,6 +16298,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16414,6 +16528,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16437,6 +16552,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; 
GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16666,6 +16782,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16689,6 +16806,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16936,6 +17054,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16956,6 +17075,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17215,6 +17335,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17239,6 +17360,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17498,6 +17620,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17523,6 +17646,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17788,6 +17912,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: 
flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17817,6 +17942,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18082,6 +18208,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18111,6 +18238,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18370,6 +18498,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18394,6 +18523,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18653,6 +18783,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18677,6 +18808,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18942,6 +19074,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18971,6 +19104,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19236,6 +19370,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19265,6 +19400,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19530,6 +19666,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19559,6 +19696,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19824,6 +19962,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19853,6 +19992,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20118,6 +20258,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20147,6 +20288,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20412,6 +20554,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20441,6 +20584,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20706,6 +20850,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20735,6 +20880,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21000,6 +21146,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21029,6 +21176,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index b56860991b194..4ba64af63e5f5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX12-WGP-LABEL: global_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ 
-194,6 +195,7 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX12-CU-LABEL: global_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -378,6 +380,7 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-WGP-LABEL: global_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -390,6 +393,7 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-CU-LABEL: global_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -588,6 +592,7 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-WGP-LABEL: global_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -603,6 +608,7 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-CU-LABEL: global_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -811,6 +817,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-WGP-LABEL: global_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -830,6 +837,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-LABEL: global_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -995,6 +1003,7 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX12-WGP-LABEL: global_agent_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1006,6 +1015,7 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX12-CU-LABEL: global_agent_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1162,6 +1172,7 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-WGP-LABEL: global_agent_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 
s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1173,6 +1184,7 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-CU-LABEL: global_agent_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1346,6 +1358,7 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-WGP-LABEL: global_agent_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1362,6 +1375,7 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-CU-LABEL: global_agent_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1540,6 +1554,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-WGP-LABEL: global_agent_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1556,6 +1571,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-CU-LABEL: global_agent_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -3395,6 +3411,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3410,6 +3427,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3644,6 +3662,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3661,6 +3680,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3889,6 +3909,7 @@ define amdgpu_kernel void 
@global_agent_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3909,6 +3930,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4165,6 +4187,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4187,6 +4210,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4445,6 +4469,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4467,6 +4492,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4708,6 +4734,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4725,6 +4752,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4961,6 +4989,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4978,6 +5007,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: 
global_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5231,6 +5261,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5253,6 +5284,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5511,6 +5543,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5533,6 +5566,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5791,6 +5825,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5813,6 +5848,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6071,6 +6107,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6093,6 +6130,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6351,6 +6389,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6373,6 +6412,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6631,6 +6671,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6653,6 +6694,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6911,6 +6953,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6933,6 +6976,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7191,6 +7235,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7213,6 +7258,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7454,6 +7500,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7471,6 +7518,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7724,6 +7772,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7744,6 +7793,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8002,6 +8052,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8024,6 +8075,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8299,6 +8351,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8324,6 +8377,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8602,6 +8656,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8627,6 +8682,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8888,6 +8944,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8908,6 +8965,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9164,6 +9222,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9184,6 +9243,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9457,6 +9517,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9482,6 +9543,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9760,6 +9822,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9785,6 +9848,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10063,6 +10127,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10088,6 +10153,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10366,6 +10432,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10391,6 +10458,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10669,6 +10737,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10694,6 +10763,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10972,6 +11042,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10997,6 +11068,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11275,6 +11347,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11300,6 +11373,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11578,6 +11652,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11603,6 +11678,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11802,6 +11878,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX12-WGP-LABEL: global_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11814,6 +11891,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX12-CU-LABEL: global_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11998,6 +12076,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -12010,6 +12089,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -12208,6 +12288,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -12223,6 +12304,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-CU-LABEL: global_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -12431,6 +12513,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -12450,6 +12533,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 @@ -12615,6 +12699,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX12-WGP-LABEL: global_agent_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -12626,6 +12711,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX12-CU-LABEL: global_agent_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12782,6 +12868,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -12793,6 +12880,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12966,6 +13054,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-WGP-LABEL: global_agent_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -12982,6 +13071,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-CU-LABEL: global_agent_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -13160,6 +13250,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -13176,6 +13267,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -15015,6 +15107,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15030,6 +15123,7 @@ define amdgpu_kernel void 
@global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15264,6 +15358,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15281,6 +15376,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15509,6 +15605,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15529,6 +15626,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15785,6 +15883,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15807,6 +15906,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16065,6 +16165,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16087,6 +16188,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 @@ -16328,6 +16430,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16345,6 +16448,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16581,6 +16685,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16598,6 +16703,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16851,6 +16957,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16873,6 +16980,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17131,6 +17239,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17153,6 +17262,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17411,6 +17521,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17433,6 +17544,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17691,6 +17803,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17713,6 +17826,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17971,6 +18085,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17993,6 +18108,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18251,6 +18367,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18273,6 +18390,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18531,6 +18649,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18553,6 +18672,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18811,6 +18931,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18833,6 +18954,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19074,6 +19196,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19091,6 +19214,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19344,6 +19468,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19364,6 +19489,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19637,6 +19763,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19662,6 +19789,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19940,6 +20068,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: 
global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19965,6 +20094,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20226,6 +20356,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20246,6 +20377,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20502,6 +20634,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20522,6 +20655,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20795,6 +20929,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20820,6 +20955,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21098,6 +21234,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 @@ -21123,6 +21260,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21401,6 +21539,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21426,6 +21565,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21704,6 +21844,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21729,6 +21870,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22007,6 +22149,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22032,6 +22175,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22310,6 +22454,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22335,6 +22480,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe 
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22613,6 +22759,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22638,6 +22785,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22916,6 +23064,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22941,6 +23090,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index 7a9cb992a0cd1..0fc3212b0f46d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -6,6 +6,7 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr ; GFX12-LABEL: global_last_use_load_0: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -25,13 +26,16 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr ; GFX12-LABEL: global_last_use_load_1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU @@ -50,6 +54,7 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i ; GFX12-LABEL: global_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -70,13 +75,16 @@ define amdgpu_kernel void 
@global_last_use_and_nontemporal_load(ptr addrspace(1) ; GFX12-LABEL: global_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 9b2b3a4cfa9ba..14f1734235673 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -178,6 +178,7 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX12-WGP-LABEL: global_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -191,6 +192,7 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX12-CU-LABEL: global_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -437,13 +439,16 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-WGP-LABEL: global_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-WGP-NEXT: s_mov_b32 s4, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT @@ -454,13 +459,16 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-CU-LABEL: global_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-CU-NEXT: s_mov_b32 s4, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT @@ -641,6 +649,7 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX12-WGP-LABEL: global_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 
0 @@ -654,6 +663,7 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX12-CU-LABEL: global_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -881,13 +891,16 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-WGP-LABEL: global_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -897,13 +910,16 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-CU-LABEL: global_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1087,6 +1103,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX12-WGP-LABEL: global_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -1101,6 +1118,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX12-CU-LABEL: global_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index afc46fbc23a67..33aaeebf658dd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX12-WGP-LABEL: global_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -194,6 +195,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX12-CU-LABEL: global_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -378,6 +380,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX12-WGP-LABEL: global_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -390,6 +393,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX12-CU-LABEL: global_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -574,6 +578,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX12-WGP-LABEL: global_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -586,6 +591,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX12-CU-LABEL: global_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -770,6 +776,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -782,6 +789,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX12-CU-LABEL: global_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -940,6 +948,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX12-WGP-LABEL: global_singlethread_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -951,6 +960,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX12-CU-LABEL: global_singlethread_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1107,6 +1117,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX12-WGP-LABEL: global_singlethread_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1118,6 +1129,7 @@ define amdgpu_kernel 
void @global_singlethread_monotonic_store( ; GFX12-CU-LABEL: global_singlethread_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1274,6 +1286,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX12-WGP-LABEL: global_singlethread_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1285,6 +1298,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX12-CU-LABEL: global_singlethread_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1441,6 +1455,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1452,6 +1467,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX12-CU-LABEL: global_singlethread_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -3004,6 +3020,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3019,6 +3036,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3228,6 +3246,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3243,6 +3262,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3452,6 +3472,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( 
; GFX12-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3467,6 +3488,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3676,6 +3698,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3691,6 +3714,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3900,6 +3924,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3915,6 +3940,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4124,6 +4150,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4139,6 +4166,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4348,6 +4376,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4363,6 +4392,7 @@ define 
amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4572,6 +4602,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4587,6 +4618,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4796,6 +4828,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4811,6 +4844,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5020,6 +5054,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5035,6 +5070,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5244,6 +5280,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5259,6 +5296,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 
@@ -5468,6 +5506,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5483,6 +5522,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5692,6 +5732,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5707,6 +5748,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5916,6 +5958,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5931,6 +5974,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6140,6 +6184,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6155,6 +6200,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6389,6 +6435,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; 
GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6406,6 +6453,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6644,6 +6692,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6661,6 +6710,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6899,6 +6949,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6916,6 +6967,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7154,6 +7206,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7171,6 +7224,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7409,6 +7463,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7426,6 +7481,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7664,6 +7720,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7681,6 +7738,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7919,6 +7977,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7936,6 +7995,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8174,6 +8234,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8191,6 +8252,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8429,6 +8491,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8446,6 +8509,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8684,6 +8748,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: 
global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8701,6 +8766,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8939,6 +9005,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8956,6 +9023,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9194,6 +9262,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9211,6 +9280,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9449,6 +9519,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9466,6 +9537,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9704,6 +9776,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9721,6 
+9794,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9959,6 +10033,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9976,6 +10051,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10167,6 +10243,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10179,6 +10256,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX12-CU-LABEL: global_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10363,6 +10441,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10375,6 +10454,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10559,6 +10639,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10571,6 +10652,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10755,6 +10837,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10767,6 +10850,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10925,6 +11009,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -10936,6 +11021,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX12-CU-LABEL: global_singlethread_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11092,6 +11178,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11103,6 +11190,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11259,6 +11347,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX12-WGP-LABEL: global_singlethread_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11270,6 +11359,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX12-CU-LABEL: global_singlethread_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11426,6 +11516,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11437,6 +11528,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12989,6 +13081,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13004,6 +13097,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13213,6 +13307,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13228,6 +13323,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13437,6 +13533,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13452,6 +13549,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13661,6 +13759,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13676,6 +13775,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: 
; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13885,6 +13985,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13900,6 +14001,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14109,6 +14211,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14124,6 +14227,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14333,6 +14437,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14348,6 +14453,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14557,6 +14663,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14572,6 +14679,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 @@ -14781,6 +14889,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14796,6 +14905,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15005,6 +15115,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15020,6 +15131,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15229,6 +15341,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15244,6 +15357,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15453,6 +15567,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15468,6 +15583,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15677,6 +15793,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15692,6 +15809,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15901,6 +16019,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15916,6 +16035,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16125,6 +16245,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16140,6 +16261,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16374,6 +16496,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16391,6 +16514,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16629,6 +16753,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16646,6 +16771,7 @@ 
define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16884,6 +17010,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16901,6 +17028,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17139,6 +17267,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17156,6 +17285,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17394,6 +17524,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17411,6 +17542,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17649,6 +17781,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17666,6 +17799,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; 
%entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17904,6 +18038,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17921,6 +18056,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18159,6 +18295,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18176,6 +18313,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18414,6 +18552,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18431,6 +18570,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18669,6 +18809,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18686,6 +18827,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 @@ -18924,6 +19066,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18941,6 +19084,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19179,6 +19323,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19196,6 +19341,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19434,6 +19580,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19451,6 +19598,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19689,6 +19837,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19706,6 +19855,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19944,6 +20094,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: 
global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19961,6 +20112,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 62a4f3b43b2dc..2c877755019ce 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX12-WGP-LABEL: global_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -194,6 +195,7 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX12-CU-LABEL: global_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -378,6 +380,7 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-WGP-LABEL: global_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -390,6 +393,7 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-CU-LABEL: global_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -590,6 +594,7 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-WGP-LABEL: global_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -605,6 +610,7 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-CU-LABEL: global_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -815,6 +821,7 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-WGP-LABEL: global_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -834,6 +841,7 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-LABEL: global_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -999,6 +1007,7 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX12-WGP-LABEL: global_system_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1010,6 +1019,7 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX12-CU-LABEL: global_system_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1166,6 +1176,7 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-WGP-LABEL: global_system_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1177,6 +1188,7 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-CU-LABEL: global_system_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1352,6 +1364,7 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-WGP-LABEL: global_system_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1368,6 +1381,7 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-CU-LABEL: global_system_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1548,6 +1562,7 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-WGP-LABEL: global_system_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1564,6 +1579,7 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-CU-LABEL: global_system_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -3425,6 +3441,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3440,6 +3457,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3676,6 +3694,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3693,6 +3712,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3923,6 +3943,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3943,6 +3964,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4203,6 +4225,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4225,6 +4248,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4487,6 +4511,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4509,6 +4534,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4752,6 +4778,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4769,6 +4796,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5007,6 +5035,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5024,6 +5053,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5281,6 +5311,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5303,6 +5334,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5565,6 +5597,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5587,6 +5620,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5849,6 +5883,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5871,6 +5906,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6133,6 +6169,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6155,6 +6192,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6396,6 +6434,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6413,6 +6452,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6668,6 +6708,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6688,6 +6729,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6965,6 +7007,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6990,6 +7033,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7272,6 +7316,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7297,6 +7342,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7560,6 +7606,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7580,6 +7627,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7838,6 +7886,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7858,6 +7907,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8135,6 +8185,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8160,6 +8211,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8442,6 +8494,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8467,6 +8520,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8749,6 +8803,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8774,6 +8829,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9056,6 +9112,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9081,6 +9138,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9363,6 +9421,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9388,6 +9447,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9670,6 +9730,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9695,6 +9756,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9977,6 +10039,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10002,6 +10065,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10284,6 +10348,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10309,6 +10374,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10508,6 +10574,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX12-WGP-LABEL: global_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10520,6 +10587,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX12-CU-LABEL: global_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10704,6 +10772,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10716,6 +10785,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-CU-LABEL: global_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10916,6 +10986,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-WGP-LABEL: global_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 
0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10931,6 +11002,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-CU-LABEL: global_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11141,6 +11213,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11160,6 +11233,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11325,6 +11399,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX12-WGP-LABEL: global_system_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11336,6 +11411,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX12-CU-LABEL: global_system_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11492,6 +11568,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11503,6 +11580,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-CU-LABEL: global_system_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11678,6 +11756,7 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-WGP-LABEL: global_system_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11694,6 +11773,7 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-CU-LABEL: global_system_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11874,6 +11954,7 @@ define amdgpu_kernel void 
@global_system_one_as_seq_cst_store( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11890,6 +11971,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -13751,6 +13833,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13766,6 +13849,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14002,6 +14086,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14019,6 +14104,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14249,6 +14335,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14269,6 +14356,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14529,6 +14617,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 @@ -14551,6 +14640,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14813,6 +14903,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14835,6 +14926,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15078,6 +15170,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15095,6 +15188,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15333,6 +15427,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15350,6 +15445,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15607,6 +15703,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15629,6 +15726,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15891,6 +15989,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15913,6 +16012,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16175,6 +16275,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16197,6 +16298,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16459,6 +16561,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16481,6 +16584,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16743,6 +16847,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16765,6 +16870,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17027,6 +17133,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; 
GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17049,6 +17156,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17311,6 +17419,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17333,6 +17442,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17595,6 +17705,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17617,6 +17728,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17858,6 +17970,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17875,6 +17988,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18130,6 +18244,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18150,6 +18265,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: 
global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18410,6 +18526,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18432,6 +18549,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18711,6 +18829,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18736,6 +18855,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19018,6 +19138,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19043,6 +19164,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19306,6 +19428,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19326,6 +19449,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; 
GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19584,6 +19708,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19604,6 +19729,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19881,6 +20007,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19906,6 +20033,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20188,6 +20316,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20213,6 +20342,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20495,6 +20625,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20520,6 +20651,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20802,6 +20934,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20827,6 +20960,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21109,6 +21243,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21134,6 +21269,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21416,6 +21552,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21441,6 +21578,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21723,6 +21861,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21748,6 +21887,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22030,6 +22170,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22055,6 +22196,7 @@ define amdgpu_kernel void 
@global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index a98efb49b4b72..692aee5f4b9ea 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -126,6 +126,7 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX12-WGP-LABEL: global_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -140,6 +141,7 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX12-CU-LABEL: global_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -315,13 +317,16 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-WGP-LABEL: global_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-WGP-NEXT: s_mov_b32 s4, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS @@ -334,13 +339,16 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-CU-LABEL: global_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-CU-NEXT: s_mov_b32 s4, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS @@ -474,6 +482,7 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX12-WGP-LABEL: global_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -493,6 +502,7 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX12-CU-LABEL: global_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, 0 @@ -665,13 +675,16 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-LABEL: global_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -687,13 +700,16 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-CU-LABEL: global_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -833,6 +849,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX12-WGP-LABEL: global_volatile_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -848,6 +865,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX12-CU-LABEL: global_volatile_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -967,6 +985,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-WGP-LABEL: global_volatile_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -983,6 +1002,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-CU-LABEL: global_volatile_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index f805e2cf37006..aaa11c0455606 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX12-WGP-LABEL: global_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -194,6 +195,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX12-CU-LABEL: global_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -378,6 +380,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX12-WGP-LABEL: global_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -390,6 +393,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX12-CU-LABEL: global_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -574,6 +578,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX12-WGP-LABEL: global_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -586,6 +591,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX12-CU-LABEL: global_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -770,6 +776,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -782,6 +789,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX12-CU-LABEL: global_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -940,6 +948,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX12-WGP-LABEL: global_wavefront_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -951,6 +960,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX12-CU-LABEL: global_wavefront_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1107,6 +1117,7 @@ define amdgpu_kernel void 
@global_wavefront_monotonic_store( ; GFX12-WGP-LABEL: global_wavefront_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1118,6 +1129,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX12-CU-LABEL: global_wavefront_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1274,6 +1286,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX12-WGP-LABEL: global_wavefront_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1285,6 +1298,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX12-CU-LABEL: global_wavefront_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1441,6 +1455,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1452,6 +1467,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX12-CU-LABEL: global_wavefront_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -3004,6 +3020,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3019,6 +3036,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3228,6 +3246,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3243,6 +3262,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: 
; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3452,6 +3472,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3467,6 +3488,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3676,6 +3698,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3691,6 +3714,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3900,6 +3924,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3915,6 +3940,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4124,6 +4150,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4139,6 +4166,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4348,6 +4376,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: 
; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4363,6 +4392,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4572,6 +4602,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4587,6 +4618,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4796,6 +4828,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4811,6 +4844,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5020,6 +5054,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5035,6 +5070,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5244,6 +5280,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5259,6 +5296,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5468,6 +5506,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5483,6 +5522,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5692,6 +5732,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5707,6 +5748,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5916,6 +5958,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5931,6 +5974,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6140,6 +6184,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6155,6 +6200,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6389,6 +6435,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6406,6 +6453,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6644,6 +6692,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6661,6 +6710,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6899,6 +6949,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6916,6 +6967,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7154,6 +7206,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7171,6 +7224,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7409,6 +7463,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7426,6 +7481,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: 
global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7664,6 +7720,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7681,6 +7738,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7919,6 +7977,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7936,6 +7995,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8174,6 +8234,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8191,6 +8252,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8429,6 +8491,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8446,6 +8509,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8684,6 +8748,7 @@ define amdgpu_kernel void 
@global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8701,6 +8766,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8939,6 +9005,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8956,6 +9023,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9194,6 +9262,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9211,6 +9280,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9449,6 +9519,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9466,6 +9537,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9704,6 +9776,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 
0x8 @@ -9721,6 +9794,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9959,6 +10033,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9976,6 +10051,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10167,6 +10243,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10179,6 +10256,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX12-CU-LABEL: global_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10363,6 +10441,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10375,6 +10454,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10559,6 +10639,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10571,6 +10652,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 
@@ -10755,6 +10837,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10767,6 +10850,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10925,6 +11009,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -10936,6 +11021,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX12-CU-LABEL: global_wavefront_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11092,6 +11178,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11103,6 +11190,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11259,6 +11347,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11270,6 +11359,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX12-CU-LABEL: global_wavefront_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11426,6 +11516,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11437,6 +11528,7 @@ define 
amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12989,6 +13081,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13004,6 +13097,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13213,6 +13307,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13228,6 +13323,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13437,6 +13533,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13452,6 +13549,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13661,6 +13759,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13676,6 +13775,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 
v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13885,6 +13985,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13900,6 +14001,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14109,6 +14211,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14124,6 +14227,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14333,6 +14437,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14348,6 +14453,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14557,6 +14663,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14572,6 +14679,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14781,6 +14889,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: 
; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14796,6 +14905,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15005,6 +15115,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15020,6 +15131,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15229,6 +15341,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15244,6 +15357,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15453,6 +15567,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15468,6 +15583,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15677,6 +15793,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15692,6 +15809,7 @@ define amdgpu_kernel void 
@global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15901,6 +16019,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15916,6 +16035,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16125,6 +16245,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16140,6 +16261,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16374,6 +16496,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16391,6 +16514,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16629,6 +16753,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16646,6 +16771,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16884,6 +17010,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16901,6 +17028,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17139,6 +17267,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17156,6 +17285,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17394,6 +17524,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17411,6 +17542,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17649,6 +17781,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17666,6 +17799,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17904,6 +18038,7 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17921,6 +18056,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18159,6 +18295,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18176,6 +18313,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18414,6 +18552,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18431,6 +18570,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18669,6 +18809,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18686,6 +18827,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18924,6 +19066,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18941,6 +19084,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19179,6 +19323,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19196,6 +19341,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19434,6 +19580,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19451,6 +19598,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19689,6 +19837,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19706,6 +19855,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19944,6 +20094,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19961,6 +20112,7 @@ define amdgpu_kernel void 
@global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 30bf492071535..25c75aa50df09 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX12-WGP-LABEL: global_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -194,6 +195,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX12-CU-LABEL: global_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -378,6 +380,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX12-WGP-LABEL: global_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -390,6 +393,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX12-CU-LABEL: global_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -578,6 +582,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX12-WGP-LABEL: global_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -593,6 +598,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX12-CU-LABEL: global_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -786,6 +792,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -805,6 +812,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-CU-LABEL: global_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -964,6 +972,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX12-WGP-LABEL: global_workgroup_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -975,6 +984,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX12-CU-LABEL: global_workgroup_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1131,6 +1141,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-WGP-LABEL: global_workgroup_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1142,6 +1153,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-CU-LABEL: global_workgroup_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1311,6 +1323,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-WGP-LABEL: global_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1327,6 +1340,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-CU-LABEL: global_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1497,6 +1511,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1513,6 +1528,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-CU-LABEL: global_workgroup_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -3212,6 +3228,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3227,6 +3244,7 @@ define amdgpu_kernel void 
@global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3444,6 +3462,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3461,6 +3480,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3683,6 +3703,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3703,6 +3724,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3934,6 +3956,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3956,6 +3979,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4187,6 +4211,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4209,6 +4234,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4427,6 +4453,7 @@ define amdgpu_kernel 
void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4444,6 +4471,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4661,6 +4689,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4678,6 +4707,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4908,6 +4938,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4930,6 +4961,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5161,6 +5193,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5183,6 +5216,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5414,6 +5448,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5436,6 +5471,7 @@ define amdgpu_kernel void 
@global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5667,6 +5703,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5689,6 +5726,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5920,6 +5958,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5942,6 +5981,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6173,6 +6213,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6195,6 +6236,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6426,6 +6468,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6448,6 +6491,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6679,6 +6723,7 @@ define amdgpu_kernel void 
@global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6701,6 +6746,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6936,6 +6982,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6953,6 +7000,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7195,6 +7243,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7215,6 +7264,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7466,6 +7516,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7488,6 +7539,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7744,6 +7796,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 
s3, s[4:5], 0x8 @@ -7769,6 +7822,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8025,6 +8079,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8050,6 +8105,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8293,6 +8349,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8313,6 +8370,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8555,6 +8613,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8575,6 +8634,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8830,6 +8890,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8855,6 +8916,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9111,6 +9173,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9136,6 +9199,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9392,6 +9456,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9417,6 +9482,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9673,6 +9739,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9698,6 +9765,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9954,6 +10022,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9979,6 +10048,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10235,6 +10305,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe 
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10260,6 +10331,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10516,6 +10588,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10541,6 +10614,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10797,6 +10871,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10822,6 +10897,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11014,6 +11090,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11026,6 +11103,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX12-CU-LABEL: global_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11210,6 +11288,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11222,6 +11301,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; 
GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11410,6 +11490,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11425,6 +11506,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11615,6 +11697,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11634,6 +11717,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11792,6 +11876,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11803,6 +11888,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX12-CU-LABEL: global_workgroup_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11959,6 +12045,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11970,6 +12057,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12132,6 +12220,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -12148,6 +12237,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-CU-LABEL: global_workgroup_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12310,6 +12400,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -12326,6 +12417,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -13984,6 +14076,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13999,6 +14092,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14216,6 +14310,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14233,6 +14328,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14448,6 +14544,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14468,6 +14565,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; 
%entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14691,6 +14789,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14713,6 +14812,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14936,6 +15036,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14958,6 +15059,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15175,6 +15277,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15192,6 +15295,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15409,6 +15513,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15426,6 +15531,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15649,6 +15755,7 @@ define amdgpu_kernel void 
@global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15671,6 +15778,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15894,6 +16002,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15916,6 +16025,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16139,6 +16249,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16161,6 +16272,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16384,6 +16496,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16406,6 +16519,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16629,6 +16743,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16651,6 +16766,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16874,6 +16990,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16896,6 +17013,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17119,6 +17237,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17141,6 +17260,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17364,6 +17484,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17386,6 +17507,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17620,6 +17742,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17637,6 +17760,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; 
%entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17879,6 +18003,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17899,6 +18024,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18143,6 +18269,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18165,6 +18292,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18413,6 +18541,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18438,6 +18567,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18686,6 +18816,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18711,6 +18842,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 @@ -18953,6 +19085,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18973,6 +19106,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19215,6 +19349,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19235,6 +19370,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19483,6 +19619,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19508,6 +19645,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19756,6 +19894,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19781,6 +19920,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20029,6 +20169,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; 
GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20054,6 +20195,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20302,6 +20444,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20327,6 +20470,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20575,6 +20719,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20600,6 +20745,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20848,6 +20994,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20873,6 +21020,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21121,6 +21269,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21146,6 
+21295,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21394,6 +21544,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21419,6 +21570,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll index 67ca31a2bb84e..1a2058cbe39e4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s 2>&1 | FileCheck %s diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 02cd97c9fe82a..d925ca52f8560 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5164,6 +5186,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ 
-6047,6 +6078,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6491,6 +6525,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, 
s[4:5], 0x8 @@ -7212,6 +7252,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 
0x8 @@ -8189,6 +8238,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9392,6 
+9450,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: 
s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12813,6 +12882,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 
0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; 
GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15050,6 +15143,7 @@ define amdgpu_kernel void 
@local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 
s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: 
; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17923,6 +18042,7 @@ define amdgpu_kernel void 
@local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index ba9711333a194..fce60ff12aed3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -183,6 +183,7 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX12-WGP-LABEL: local_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -196,6 +197,7 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX12-CU-LABEL: local_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -420,13 +422,16 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-WGP-LABEL: local_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 @@ -437,13 +442,16 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-LABEL: local_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 @@ -615,6 +623,7 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; GFX12-WGP-LABEL: local_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -628,6 +637,7 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; GFX12-CU-LABEL: local_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -826,8 +836,10 @@ define amdgpu_kernel 
void @local_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-WGP-NEXT: s_mov_b32 s1, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -841,8 +853,10 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-CU-NEXT: s_mov_b32 s1, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1027,6 +1041,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX12-WGP-LABEL: local_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1040,6 +1055,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX12-CU-LABEL: local_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index fe5f2c51734f7..033c71574643c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -2657,6 +2657,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2670,6 +2671,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2837,6 +2839,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2850,6 +2853,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 
s0, s[4:5], 0x8 @@ -3017,6 +3021,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3030,6 +3035,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3197,6 +3203,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3210,6 +3217,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3377,6 +3385,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3390,6 +3399,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3557,6 +3567,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3570,6 +3581,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3737,6 +3749,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; 
GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3750,6 +3763,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3917,6 +3931,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3930,6 +3945,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4097,6 +4113,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4110,6 +4127,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4277,6 +4295,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4290,6 +4309,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4457,6 +4477,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4470,6 +4491,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; 
GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4637,6 +4659,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4650,6 +4673,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4817,6 +4841,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4830,6 +4855,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4997,6 +5023,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5010,6 +5037,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5177,6 +5205,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5190,6 +5219,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5390,6 +5420,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5406,6 +5437,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5611,6 +5643,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5627,6 +5660,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5832,6 +5866,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5848,6 +5883,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6053,6 +6089,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6069,6 +6106,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6274,6 +6312,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6290,6 +6329,7 @@ define amdgpu_kernel void 
@local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6495,6 +6535,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6511,6 +6552,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6716,6 +6758,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6732,6 +6775,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6937,6 +6981,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6953,6 +6998,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7158,6 +7204,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7174,6 +7221,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7379,6 +7427,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7395,6 +7444,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7600,6 +7650,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7616,6 +7667,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7821,6 +7873,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7837,6 +7890,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8042,6 +8096,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8058,6 +8113,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8263,6 +8319,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8279,6 +8336,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8484,6 +8542,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8500,6 +8559,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11165,6 +11225,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11178,6 +11239,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11345,6 +11407,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11358,6 +11421,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11525,6 +11589,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11538,6 +11603,7 @@ define amdgpu_kernel void 
@local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11705,6 +11771,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11718,6 +11785,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11885,6 +11953,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11898,6 +11967,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12065,6 +12135,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12078,6 +12149,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12245,6 +12317,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12258,6 +12331,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12425,6 +12499,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12438,6 +12513,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12605,6 +12681,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12618,6 +12695,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12785,6 +12863,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12798,6 +12877,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12965,6 +13045,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12978,6 +13059,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13145,6 +13227,7 @@ define amdgpu_kernel void 
@local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13158,6 +13241,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13325,6 +13409,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13338,6 +13423,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13505,6 +13591,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13518,6 +13605,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13685,6 +13773,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13698,6 +13787,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13898,6 +13988,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 
0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -13914,6 +14005,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14119,6 +14211,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14135,6 +14228,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14340,6 +14434,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14356,6 +14451,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14561,6 +14657,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14577,6 +14674,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14782,6 +14880,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14798,6 +14897,7 
@@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15003,6 +15103,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15019,6 +15120,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15224,6 +15326,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15240,6 +15343,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15445,6 +15549,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15461,6 +15566,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15666,6 +15772,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15682,6 +15789,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; 
%bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15887,6 +15995,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15903,6 +16012,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16108,6 +16218,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16124,6 +16235,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16329,6 +16441,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16345,6 +16458,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16550,6 +16664,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16566,6 +16681,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 
0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16771,6 +16887,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16787,6 +16904,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16992,6 +17110,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17008,6 +17127,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 1c4c8d41b18f9..548c5aceb25f7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, 
s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 
s0, s[4:5], 0x8 @@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5164,6 +5186,7 @@ define amdgpu_kernel void 
@local_system_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6047,6 +6078,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; 
GFX12-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6491,6 +6525,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7212,6 +7252,7 @@ define amdgpu_kernel void 
@local_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8189,6 +8238,7 @@ define 
amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9392,6 +9450,7 @@ define 
amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: 
s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12813,6 +12882,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; 
GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; 
GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15050,6 +15143,7 @@ define amdgpu_kernel 
void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; 
GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: 
local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; 
GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17923,6 +18042,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index a52dd9b340169..a8f7051bd5050 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -123,6 +123,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX12-WGP-LABEL: local_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -136,6 +137,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX12-CU-LABEL: local_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -284,13 +286,16 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-WGP-LABEL: local_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 @@ -301,13 +306,16 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-LABEL: local_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 @@ -423,6 +431,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-WGP-LABEL: local_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -441,6 +450,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-CU-LABEL: local_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 
0x0 @@ -576,8 +586,10 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-WGP-NEXT: s_mov_b32 s1, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -596,8 +608,10 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-CU-NEXT: s_mov_b32 s1, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index 02e4e0d69dc20..694ffb2964f56 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -2657,6 +2657,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2670,6 +2671,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2837,6 +2839,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2850,6 +2853,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3017,6 +3021,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3030,6 +3035,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3197,6 +3203,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3210,6 +3217,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3377,6 +3385,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3390,6 +3399,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3557,6 +3567,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3570,6 +3581,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3737,6 +3749,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3750,6 +3763,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3917,6 +3931,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; 
GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3930,6 +3945,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4097,6 +4113,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4110,6 +4127,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4277,6 +4295,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4290,6 +4309,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4457,6 +4477,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4470,6 +4491,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4637,6 +4659,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4650,6 +4673,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4817,6 +4841,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4830,6 +4855,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4997,6 +5023,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5010,6 +5037,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5177,6 +5205,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5190,6 +5219,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5390,6 +5420,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5406,6 +5437,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5611,6 +5643,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, 
s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5627,6 +5660,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5832,6 +5866,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5848,6 +5883,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6053,6 +6089,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6069,6 +6106,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6274,6 +6312,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6290,6 +6329,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6495,6 +6535,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6511,6 +6552,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6716,6 +6758,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6732,6 +6775,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6937,6 +6981,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6953,6 +6998,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7158,6 +7204,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7174,6 +7221,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7379,6 +7427,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7395,6 +7444,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7600,6 +7650,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: 
local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7616,6 +7667,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7821,6 +7873,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7837,6 +7890,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8042,6 +8096,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8058,6 +8113,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8263,6 +8319,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8279,6 +8336,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8484,6 +8542,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8500,6 +8559,7 @@ define amdgpu_kernel void 
@local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11165,6 +11225,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11178,6 +11239,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11345,6 +11407,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11358,6 +11421,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11525,6 +11589,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11538,6 +11603,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11705,6 +11771,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11718,6 +11785,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 
s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11885,6 +11953,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11898,6 +11967,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12065,6 +12135,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12078,6 +12149,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12245,6 +12317,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12258,6 +12331,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12425,6 +12499,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12438,6 +12513,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12605,6 +12681,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: 
; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12618,6 +12695,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12785,6 +12863,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12798,6 +12877,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12965,6 +13045,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12978,6 +13059,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13145,6 +13227,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13158,6 +13241,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13325,6 +13409,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13338,6 +13423,7 @@ define 
amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13505,6 +13591,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13518,6 +13605,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13685,6 +13773,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13698,6 +13787,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13898,6 +13988,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -13914,6 +14005,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14119,6 +14211,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14135,6 +14228,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 
0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14340,6 +14434,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14356,6 +14451,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14561,6 +14657,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14577,6 +14674,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14782,6 +14880,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14798,6 +14897,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15003,6 +15103,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15019,6 +15120,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15224,6 +15326,7 @@ define amdgpu_kernel void 
@local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15240,6 +15343,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15445,6 +15549,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15461,6 +15566,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15666,6 +15772,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15682,6 +15789,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15887,6 +15995,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15903,6 +16012,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16108,6 +16218,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16124,6 +16235,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16329,6 +16441,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16345,6 +16458,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16550,6 +16664,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16566,6 +16681,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16771,6 +16887,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16787,6 +16904,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16992,6 +17110,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17008,6 +17127,7 @@ define amdgpu_kernel void 
@local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index c242963228537..0cf644c006fac 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( 
; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; 
GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5164,6 +5186,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; 
GFX12-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6047,6 +6078,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6491,6 +6525,7 @@ define amdgpu_kernel void 
@local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7212,6 +7252,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; 
GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8189,6 +8238,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, 
s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9392,6 +9450,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; 
GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12813,6 +12882,7 @@ define amdgpu_kernel void 
@local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; 
GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; 
%entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15050,6 +15143,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 
@@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: 
; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 
0x8 @@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17923,6 +18042,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll index 61cec731feb56..8e292fa592975 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -6,6 +6,7 @@ define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr add ; GFX12-LABEL: private_last_use_load_0: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -24,13 +25,16 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add ; GFX12-LABEL: private_last_use_load_1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-NEXT: s_mov_b32 s3, 2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s3, v1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_LU @@ -49,6 +53,7 @@ define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) % ; GFX12-LABEL: private_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -69,6 +74,7 @@ define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5 ; GFX12-LABEL: private_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 4e08065e879fd..c3599c87985be 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -193,6 +193,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX12-WGP-LABEL: private_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -205,6 +206,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX12-CU-LABEL: private_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -442,13 +444,16 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-WGP-LABEL: private_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT @@ -459,13 +464,16 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-CU-LABEL: private_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT @@ -648,6 +656,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX12-WGP-LABEL: private_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -660,6 +669,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX12-CU-LABEL: private_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -868,13 +878,16 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-WGP-LABEL: private_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -884,13 +897,16 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-CU-LABEL: private_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1085,6 +1101,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX12-WGP-LABEL: private_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1099,6 +1116,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX12-CU-LABEL: private_nontemporal_volatile_load: ; 
GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index a68b5f36b806e..9146f175eefcd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -135,6 +135,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX12-WGP-LABEL: private_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -149,6 +150,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX12-CU-LABEL: private_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -312,13 +314,16 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-WGP-LABEL: private_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS @@ -331,13 +336,16 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-CU-LABEL: private_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS @@ -475,6 +483,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX12-WGP-LABEL: private_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -493,6 +502,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX12-CU-LABEL: private_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -646,13 +656,16 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-WGP-LABEL: private_volatile_store_1: ; GFX12-WGP: 
; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -668,13 +681,16 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-CU-LABEL: private_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll index bbbbc0dc0f28d..a642543c3780d 100644 --- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll +++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll @@ -11,8 +11,10 @@ define amdgpu_cs float @v_s_exp_f32(float inreg %src) { ; GFX12-NEXT: s_add_f32 s0, s0, s1 ; GFX12-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0 ; GFX12-NEXT: v_s_exp_f32 s0, s0 -; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog %result = call float @llvm.exp2.f32(float %src) @@ -61,8 +63,10 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) { ; GFX12-NEXT: s_mul_f32 s0, s0, s1 ; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0 ; GFX12-NEXT: v_s_log_f32 s0, s0 -; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog %result = call float @llvm.log2.f32(float %src) @@ -166,6 +170,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_cselect_b32 s1, s1, s0 ; GFX12-SDAG-NEXT: v_s_sqrt_f32 s2, s1 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_mov_b32 s4, s1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s3, s2, -1 @@ -179,16 +184,19 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_xor_b32 s6, s4, 0x80000000 ; GFX12-SDAG-NEXT: s_fmac_f32 s5, s6, s2 -; GFX12-SDAG-NEXT: s_delay_alu 
instid0(SALU_CYCLE_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX12-SDAG-NEXT: s_cmp_gt_f32 s5, 0 ; GFX12-SDAG-NEXT: s_cselect_b32 s2, s4, s3 ; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xf800000 ; GFX12-SDAG-NEXT: s_mul_f32 s0, s2, 0x37800000 ; GFX12-SDAG-NEXT: v_cmp_class_f32_e64 s3, s1, 0x260 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: s_cselect_b32 s0, s0, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_and_b32 s2, s3, exec_lo +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_cselect_b32 s0, s1, s0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; @@ -200,6 +208,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_cselect_b32 s0, s2, s0 ; GFX12-GISEL-NEXT: v_s_sqrt_f32 s2, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_mov_b32 s4, s0 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -217,11 +226,11 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2 ; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s1, s0, 0x260 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.sqrt.f32(float %src) @@ -270,10 +279,12 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) { ; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0 ; GFX12-NEXT: s_mul_f32 s0, s0, s1 ; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) ; GFX12-NEXT: v_s_log_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sub_f32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog %abs = call float @llvm.fabs.f32(float %src) @@ -291,8 +302,10 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_mul_f32 s0, s1, s0 ; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0 ; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; @@ -304,10 +317,12 @@ define amdgpu_cs float 
@srcmods_neg_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0 ; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 ; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) ; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %neg = fneg float %src diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll index 4ea77d1d1ac15..b7aecca45def5 100644 --- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll @@ -63,10 +63,11 @@ define void @test_remat_s_getpc_b64() { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v2, s30, 0 ; GFX12-NEXT: s_getpc_b64 s[0:1] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s1, s1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ;;#ASMEND @@ -74,16 +75,19 @@ define void @test_remat_s_getpc_b64() { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_readlane_b32 s31, v2, 1 ; GFX12-NEXT: v_readlane_b32 s30, v2, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i64 @llvm.amdgcn.s.getpc() diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll index 1f36f7a0d9616..d4f7bf656d3b5 100644 --- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -62,12 +62,14 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; GFX12-TRUE16-NEXT: v_swap_b16 v0.l, v0.h ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: swap: @@ -80,17 +82,19 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB0_1: ; %loop ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2 ; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir index 65feaf23ae2cb..ab222f4feeef0 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir @@ -45,18 +45,12 @@ name: mask_hazard_getpc1 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_getpc1 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_getpc1 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_getpc1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec $sgpr0_sgpr1 = S_GETPC_B64 $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -67,24 +61,15 @@ body: | name: mask_hazard_getpc2 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_getpc2 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc - ; GFX11-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc - ; GFX11-NEXT: } - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_getpc2 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc - ; GFX12-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc, implicit $scc - ; GFX12-NEXT: } - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_getpc2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, 
implicit-def $scc + ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc + ; GCN-NEXT: } + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec BUNDLE implicit-def $sgpr0_sgpr1 { $sgpr0_sgpr1 = S_GETPC_B64 @@ -523,18 +508,12 @@ body: | name: mask_hazard_subreg4 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_subreg4 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec - ; GFX11-NEXT: $vcc_lo = S_MOV_B32 0 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_subreg4 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec - ; GFX12-NEXT: $vcc_lo = S_MOV_B32 0 - ; GFX12-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_subreg4 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc_lo = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec $vcc_lo = S_MOV_B32 0 $sgpr2 = S_MOV_B32 $vcc_lo @@ -546,18 +525,12 @@ body: | name: mask_hazard_subreg5 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_subreg5 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec - ; GFX11-NEXT: $vcc_hi = S_MOV_B32 0 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_subreg5 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec - ; GFX12-NEXT: $vcc_hi = S_MOV_B32 0 - ; GFX12-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_subreg5 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc_hi = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec $vcc_hi = S_MOV_B32 0 $sgpr2 = S_MOV_B32 $vcc_hi @@ -569,20 +542,13 @@ body: | name: mask_hazard_waitcnt body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_waitcnt - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: S_WAITCNT 0 - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_waitcnt - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: S_WAITCNT 0 - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_waitcnt + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec S_WAITCNT 0 $sgpr0_sgpr1 = S_GETPC_B64 @@ -595,22 +561,14 @@ body: | name: mask_hazard_gap1 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_gap1 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, 
$sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_gap1 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec - ; GFX12-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_gap1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec $vgpr2 = V_MOV_B32_e32 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec @@ -624,20 +582,13 @@ body: | name: mask_hazard_gap2 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_gap2 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_gap2 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_gap2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode $sgpr0_sgpr1 = S_GETPC_B64 @@ -650,20 +601,13 @@ body: | name: mask_hazard_gap3 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_gap3 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_gap3 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_gap3 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; 
GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 $sgpr0_sgpr1 = S_GETPC_B64 diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir new file mode 100644 index 0000000000000..2aa16dd904766 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir @@ -0,0 +1,862 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O0 %s +# RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O2 %s + +--- | + @mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> ] + + define amdgpu_gs void @hazard_getpc1() { ret void } + define amdgpu_gs void @hazard_getpc2() { ret void } + define amdgpu_gs void @hazard_getpc3() { ret void } + define amdgpu_gs void @hazard_getpc4() { ret void } + define amdgpu_gs void @hazard_vcc1() { ret void } + define amdgpu_gs void @hazard_vcc2() { ret void } + define amdgpu_gs void @hazard_vcc3() { ret void } + define amdgpu_gs void @hazard_addc1() { ret void } + define amdgpu_gs void @hazard_addc2() { ret void } + define amdgpu_gs void @hazard_addc3() { ret void } + define amdgpu_gs void @hazard_addc4() { ret void } + define amdgpu_gs void @hazard_addc5() { ret void } + define amdgpu_gs void @hazard_addc6() { ret void } + define amdgpu_gs void @hazard_vaddc1() { ret void } + define amdgpu_gs void @hazard_gap1() { ret void } + define amdgpu_gs void @hazard_gap2() { ret void } + define amdgpu_gs void @hazard_gap3() { ret void } + define amdgpu_gs void @hazard_gap4_no_hazard() { ret void } + define amdgpu_gs void @hazard_valu_write1_no_hazard() { ret void } + define amdgpu_gs void @hazard_post_order1() { ret void } + define amdgpu_gs void @hazard_post_order2() { ret void } + define amdgpu_gs void @hazard_post_order_cycle() { ret void } + define amdgpu_cs void @hazard_calls() { ret void } +... + +--- +name: hazard_getpc1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_getpc1 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_getpc1 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_getpc2 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_getpc2 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_getpc2 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_getpc3 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_getpc3 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 20, implicit-def $scc, implicit $scc + ; GCN-O0-NEXT: } + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_getpc3 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc + ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc + ; GCN-O2-NEXT: } + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + BUNDLE implicit-def $sgpr0_sgpr1 { + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc + $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 12, implicit-def $scc, implicit $scc + } + S_ENDPGM 0 +... 
+ +--- +name: hazard_getpc4 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_getpc4 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 28, implicit-def $scc, implicit $scc + ; GCN-O0-NEXT: } + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_getpc4 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1 + ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 24, implicit-def $scc, implicit $scc + ; GCN-O2-NEXT: } + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + BUNDLE implicit-def $sgpr0_sgpr1 { + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr1 = S_SEXT_I32_I16 $sgpr1 + $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc + $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc + } + S_ENDPGM 0 +... + +--- +name: hazard_vcc1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_vcc1 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec + ; GCN-O0-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_vcc1 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec + ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec + $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_vcc2 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_vcc2 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_vcc2 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_vcc3 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_vcc3 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_vcc3 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_addc1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc1 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc1 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_addc2 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc2 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc2 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_addc3 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc3 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc3 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_addc4 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc4 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc4 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec + $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_addc5 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc5 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc5 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr16 = S_MOV_B32 0 + $sgpr32 = S_MOV_B32 0 + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_addc6 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc6 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr48 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr80 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr96 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc6 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr48 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr80 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr96 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr16 = S_MOV_B32 0 + $sgpr32 = S_MOV_B32 0 + $sgpr48 = S_MOV_B32 0 + $sgpr80 = S_MOV_B32 0 + $sgpr96 = S_MOV_B32 0 + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_vaddc1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_vaddc1 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_vaddc1 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_gap1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_gap1 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_gap1 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_gap2 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_gap2 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_gap2 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_gap3 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_gap3 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_gap3 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def 
$scc + $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_gap4_no_hazard +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_gap4_no_hazard + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_gap4_no_hazard + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc + $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc + $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_valu_write1_no_hazard +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_valu_write1_no_hazard + ; GCN-O0: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_valu_write1_no_hazard + ; GCN-O2: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_post_order1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_post_order1 + ; GCN-O0: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_post_order1 + ; GCN-O2: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: S_ENDPGM 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_post_order2 +body: | + ; GCN-O0-LABEL: name: hazard_post_order2 + ; GCN-O0: bb.0: + ; GCN-O0-NEXT: successors: %bb.1(0x80000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_BRANCH %bb.1 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.1: + ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_post_order2 + ; GCN-O2: bb.0: + ; GCN-O2-NEXT: successors: %bb.1(0x80000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: S_BRANCH %bb.1 + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.1: + ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: S_ENDPGM 0 + bb.0: + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: hazard_post_order_cycle +body: | + ; GCN-O0-LABEL: name: hazard_post_order_cycle + ; GCN-O0: bb.0: + ; GCN-O0-NEXT: successors: %bb.1(0x80000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.1: + ; GCN-O0-NEXT: successors: %bb.2(0x80000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.2: + ; GCN-O0-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.3: + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_post_order_cycle + ; GCN-O2: bb.0: + ; GCN-O2-NEXT: successors: %bb.1(0x80000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.1: + ; GCN-O2-NEXT: successors: %bb.2(0x80000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.2: + ; GCN-O2-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.3: + ; GCN-O2-NEXT: S_ENDPGM 0 + bb.0: + S_NOP 0 + + bb.1: + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + + bb.2: + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + S_CBRANCH_SCC0 %bb.1, implicit $scc + + bb.3: + S_ENDPGM 0 +... 
+ +--- +name: hazard_calls +frameInfo: + hasCalls: true +body: | + ; GCN-O0-LABEL: name: hazard_calls + ; GCN-O0: bb.0: + ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_SETPC_B64 $sgpr0_sgpr1 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.1: + ; GCN-O0-NEXT: $sgpr18 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.2: + ; GCN-O0-NEXT: successors: %bb.3(0x80000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: $sgpr20 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.3: + ; GCN-O0-NEXT: successors: %bb.4(0x80000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.4: + ; GCN-O0-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_calls + ; GCN-O2: bb.0: + ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O2-NEXT: S_SETPC_B64 $sgpr0_sgpr1 + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.1: + ; GCN-O2-NEXT: $sgpr18 = S_MOV_B32 0 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.2: + ; GCN-O2-NEXT: successors: %bb.3(0x80000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: $sgpr20 = S_MOV_B32 0 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.3: + ; GCN-O2-NEXT: successors: %bb.4(0x80000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.4: + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 + ; GCN-O2-NEXT: S_ENDPGM 0 + bb.0: + $sgpr16 = S_MOV_B32 0 + S_SETPC_B64 $sgpr0_sgpr1 + + bb.1: + $sgpr18 = S_MOV_B32 0 + S_SETPC_B64_return $sgpr0_sgpr1 + + bb.2: + $sgpr20 = S_MOV_B32 0 + $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + + bb.3: + $sgpr8_sgpr9 = S_CALL_B64 0 + + bb.4: + $sgpr22 = S_MOV_B32 $sgpr8 + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir index 3f40a57ca1491..e3b96c08348fc 100644 --- a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir @@ -1,11 +1,12 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s # GCN-LABEL: name: hazard_vcmpx_permlane16 # GCN: V_CMPX_LE_F32_nosdst_e32 # GCN: S_ADD_U32 # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_permlane16 @@ -128,6 +129,7 @@ body: | # GCN: V_CMPX_LE_F32_nosdst_e32 # GCN: S_ADD_U32 # GCN-NEXT: dead $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec +# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_permlane16_undef_src @@ -150,6 +152,7 @@ body: | # GCN: V_CMPX_LE_F32_nosdst_e64 # GCN: S_ADD_U32 # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_e64_permlane16 From a5ce66423bfff6f2185e5fe48bc6ffc0ade7df4d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 3 Sep 2024 20:13:40 -0700 Subject: [PATCH 025/425] [RISCV] Remove RISCVISD::FP_ROUND_BF16. Use isel patterns on regular FP_ROUND. For double->bf16 we need to emit two instructions. Note the double->bf16 conversion does double rounding, but I don't know a good way to fix that. 
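To make the double-rounding caveat above concrete, here is a small standalone C++ sketch (not part of the patch; roundToBF16 is an illustrative helper, not an LLVM API). It picks a double just above the halfway point between two adjacent bf16 values and shows that rounding f64 -> f32 -> bf16 lands on the other side of the tie than a single correctly rounded f64 -> bf16 conversion would:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <cstdio>

// Round an f32 to bf16 with round-to-nearest-even and widen the result back
// to f32 for easy comparison. NaN handling is omitted; illustration only.
static float roundToBF16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint32_t lsb = (bits >> 16) & 1;   // LSB of the 16 bits we keep
  bits += 0x7FFF + lsb;              // round-to-nearest-even bias
  bits &= 0xFFFF0000u;               // drop the low 16 mantissa bits
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  // 1 + 2^-8 is exactly halfway between the adjacent bf16 values 1.0 and
  // 1.0078125 (= 1 + 2^-7). Nudging it up by 2^-30 means the correctly
  // rounded f64->bf16 result is 1.0078125, but the nudge is smaller than
  // half an f32 ulp, so it is lost by the intermediate f64->f32 rounding.
  double d = 0x1p0 + 0x1p-8 + 0x1p-30;

  float once  = static_cast<float>(d);  // f64 -> f32: rounds to exactly 1 + 2^-8
  float twice = roundToBF16(once);      // f32 -> bf16: tie breaks to even, i.e. 1.0

  std::printf("two-step result %.7f, single-step result would be 1.0078125\n",
              twice);
  assert(once == 1.0f + 0x1p-8f);
  assert(twice == 1.0f);                // double rounding picked the other side
  return 0;
}

The same effect applies to the FCVT_BF16_S(FCVT_S_D(...)) sequence selected by the pattern below, which is why the patch adds a FIXME noting that the pattern double rounds.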
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 27 ++----------------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 -- .../lib/Target/RISCV/RISCVInstrInfoZfbfmin.td | 16 ++++------- 3 files changed, 7 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 11be5c25354f1..3742b897ca568 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -447,7 +447,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasStdExtZfbfmin()) { setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::bf16, Custom); - setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom); setOperationAction(ISD::ConstantFP, MVT::bf16, Expand); setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand); setOperationAction(ISD::SELECT, MVT::bf16, Custom); @@ -6631,30 +6630,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, !Subtarget.hasVInstructionsF16())) return SplitVectorOp(Op, DAG); return lowerFMAXIMUM_FMINIMUM(Op, DAG, Subtarget); - case ISD::FP_EXTEND: { - if (!Op.getValueType().isVector()) - return Op; + case ISD::FP_EXTEND: + case ISD::FP_ROUND: return lowerVectorFPExtendOrRoundLike(Op, DAG); - } - case ISD::FP_ROUND: { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Op0 = Op.getOperand(0); - EVT Op0VT = Op0.getValueType(); - if (VT == MVT::bf16 && Op0VT == MVT::f32 && Subtarget.hasStdExtZfbfmin()) - return DAG.getNode(RISCVISD::FP_ROUND_BF16, DL, MVT::bf16, Op0); - if (VT == MVT::bf16 && Op0VT == MVT::f64 && Subtarget.hasStdExtZfbfmin() && - Subtarget.hasStdExtDOrZdinx()) { - SDValue FloatVal = - DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Op0, - DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)); - return DAG.getNode(RISCVISD::FP_ROUND_BF16, DL, MVT::bf16, FloatVal); - } - - if (!Op.getValueType().isVector()) - return Op; - return lowerVectorFPExtendOrRoundLike(Op, DAG); - } case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_EXTEND: return lowerStrictFPExtendOrRoundLike(Op, DAG); @@ -20588,7 +20566,6 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FCVT_WU_RV64) NODE_NAME_CASE(STRICT_FCVT_W_RV64) NODE_NAME_CASE(STRICT_FCVT_WU_RV64) - NODE_NAME_CASE(FP_ROUND_BF16) NODE_NAME_CASE(FROUND) NODE_NAME_CASE(FCLASS) NODE_NAME_CASE(FSGNJX) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 29a16282ed001..f1d1cca043b35 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -116,8 +116,6 @@ enum NodeType : unsigned { FCVT_W_RV64, FCVT_WU_RV64, - FP_ROUND_BF16, - // Rounds an FP value to its corresponding integer in the same FP format. // First operand is the value to round, the second operand is the largest // integer that can be represented exactly in the FP format. This will be diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfbfmin.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfbfmin.td index fd618a938ae8c..8f0768f91c370 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfbfmin.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfbfmin.td @@ -13,16 +13,6 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// RISC-V specific DAG Nodes. 
-//===----------------------------------------------------------------------===// - -def SDT_RISCVFP_ROUND_BF16 - : SDTypeProfile<1, 1, [SDTCisVT<0, bf16>, SDTCisVT<1, f32>]>; - -def riscv_fpround_bf16 - : SDNode<"RISCVISD::FP_ROUND_BF16", SDT_RISCVFP_ROUND_BF16>; - //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -60,7 +50,7 @@ def : StPat; /// Float conversion operations // f32 -> bf16, bf16 -> f32 -def : Pat<(bf16 (riscv_fpround_bf16 FPR32:$rs1)), +def : Pat<(bf16 (fpround FPR32:$rs1)), (FCVT_BF16_S FPR32:$rs1, FRM_DYN)>; def : Pat<(fpextend (bf16 FPR16:$rs1)), (FCVT_S_BF16 FPR16:$rs1, FRM_RNE)>; @@ -72,6 +62,10 @@ def : Pat<(riscv_fmv_x_signexth (bf16 FPR16:$src)), (FMV_X_H FPR16:$src)>; } // Predicates = [HasStdExtZfbfmin] let Predicates = [HasStdExtZfbfmin, HasStdExtD] in { +// f64 -> bf16 +// FIXME: This pattern double rounds. +def : Pat<(bf16 (fpround FPR64:$rs1)), + (FCVT_BF16_S (FCVT_S_D FPR64:$rs1, FRM_DYN), FRM_DYN)>; // bf16 -> f64 def : Pat<(fpextend (bf16 FPR16:$rs1)), (FCVT_D_S (FCVT_S_BF16 FPR16:$rs1, FRM_DYN), FRM_RNE)>; From 0ad6cee926865d7210eed9e67bfb20dce19c6633 Mon Sep 17 00:00:00 2001 From: Elvis Wang <110374989+ElvisWang123@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:29:50 +0800 Subject: [PATCH 026/425] [RISCV] Fix missing `i64` to `double` tests in the cast.ll. (NFC) (#106972) --- llvm/test/Analysis/CostModel/RISCV/cast.ll | 168 ++++++++++----------- 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index 1fc4b3b346020..c1759b8b03d0f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -2613,7 +2613,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> @@ -2623,7 +2623,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> ; RV32-NEXT: 
Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> @@ -2633,7 +2633,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> @@ -2643,7 +2643,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> @@ -2653,7 +2653,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> @@ 
-2663,7 +2663,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> @@ -2673,7 +2673,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to @@ -2683,7 +2683,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to @@ -2693,7 +2693,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2f64 = sitofp undef to +; RV32-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to @@ -2703,7 +2703,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i64_nxv4f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to @@ -2713,7 +2713,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i64_nxv8f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp undef to @@ -2723,7 +2723,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i64_nxv16f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to @@ -2733,7 +2733,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: 
%nxv32i64_nxv32f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to @@ -2743,7 +2743,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i64_nxv64f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -2756,7 +2756,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> @@ -2766,7 +2766,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> @@ -2776,7 +2776,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> @@ -2786,7 +2786,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> @@ -2796,7 +2796,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> @@ -2806,7 +2806,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated 
cost of 51 for instruction: %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> @@ -2816,7 +2816,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to @@ -2826,7 +2826,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to @@ -2836,7 +2836,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to @@ -2846,7 +2846,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i64_nxv4f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to @@ -2856,7 +2856,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i64_nxv8f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp undef to @@ -2866,7 +2866,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i64_nxv16f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to @@ -2876,7 +2876,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i64_nxv32f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to @@ -2886,7 
+2886,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i64_nxv64f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -2898,7 +2898,7 @@ define void @sitofp() { %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> - %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double> + %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> @@ -2909,7 +2909,7 @@ define void @sitofp() { %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> - %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double> + %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> @@ -2920,7 +2920,7 @@ define void @sitofp() { %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> - %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double> + %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> @@ -2931,7 +2931,7 @@ define void @sitofp() { %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double> %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> - %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double> + %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> @@ -2942,7 +2942,7 @@ define void @sitofp() { %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float> %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double> %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float> - %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double> + %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> @@ -2953,7 +2953,7 @@ define void @sitofp() { %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float> %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double> %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float> - %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double> + %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> @@ -2964,7 +2964,7 @@ define void @sitofp() { %v128i32_v128f32 = 
sitofp <128 x i32> undef to <128 x float> %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double> %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float> - %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double> + %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> @@ -2975,7 +2975,7 @@ define void @sitofp() { %nxv1i32_nxv1f32 = sitofp undef to %nxv1i32_nxv1f64 = sitofp undef to %nxv1i64_nxv1f32 = sitofp undef to - %nxv1i64_nxv1f64 = sitofp undef to + %nxv1i64_nxv1f64 = sitofp undef to %nxv1i1_nxv1f32 = sitofp undef to %nxv1i1_nxv1f64 = sitofp undef to @@ -2986,7 +2986,7 @@ define void @sitofp() { %nxv2i32_nxv2f32 = sitofp undef to %nxv2i32_nxv2f64 = sitofp undef to %nxv2i64_nxv2f32 = sitofp undef to - %nxv2i64_nxv2f64 = sitofp undef to + %nxv2i64_nxv2f64 = sitofp undef to %nxv2i1_nxv2f32 = sitofp undef to %nxv2i1_nxv2f64 = sitofp undef to @@ -2997,7 +2997,7 @@ define void @sitofp() { %nxv4i32_nxv4f32 = sitofp undef to %nxv4i32_nxv4f64 = sitofp undef to %nxv4i64_nxv4f32 = sitofp undef to - %nxv4i64_nxv4f64 = sitofp undef to + %nxv4i64_nxv4f64 = sitofp undef to %nxv4i1_nxv4f32 = sitofp undef to %nxv4i1_nxv4f64 = sitofp undef to @@ -3008,7 +3008,7 @@ define void @sitofp() { %nxv8i32_nxv8f32 = sitofp undef to %nxv8i32_nxv8f64 = sitofp undef to %nxv8i64_nxv8f32 = sitofp undef to - %nxv8i64_nxv8f64 = sitofp undef to + %nxv8i64_nxv8f64 = sitofp undef to %nxv8i1_nxv8f32 = sitofp undef to %nxv8i1_nxv8f64 = sitofp undef to @@ -3019,7 +3019,7 @@ define void @sitofp() { %nxv16i32_nxv16f32 = sitofp undef to %nxv16i32_nxv16f64 = sitofp undef to %nxv16i64_nxv16f32 = sitofp undef to - %nxv16i64_nxv16f64 = sitofp undef to + %nxv16i64_nxv16f64 = sitofp undef to %nxv16i1_nxv16f32 = sitofp undef to %nxv16i1_nxv16f64 = sitofp undef to @@ -3030,7 +3030,7 @@ define void @sitofp() { %nxv32i32_nxv32f32 = sitofp undef to %nxv32i32_nxv32f64 = sitofp undef to %nxv32i64_nxv32f32 = sitofp undef to - %nxv32i64_nxv32f64 = sitofp undef to + %nxv32i64_nxv32f64 = sitofp undef to %nxv32i1_nxv32f32 = sitofp undef to %nxv32i1_nxv32f64 = sitofp undef to @@ -3041,7 +3041,7 @@ define void @sitofp() { %nxv64i32_nxv64f32 = sitofp undef to %nxv64i32_nxv64f64 = sitofp undef to %nxv64i64_nxv64f32 = sitofp undef to - %nxv64i64_nxv64f64 = sitofp undef to + %nxv64i64_nxv64f64 = sitofp undef to %nxv64i1_nxv64f32 = sitofp undef to %nxv64i1_nxv64f64 = sitofp undef to @@ -3057,7 +3057,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> @@ -3067,7 
+3067,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> @@ -3077,7 +3077,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> @@ -3087,7 +3087,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> @@ -3097,7 +3097,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> 
undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> @@ -3107,7 +3107,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> @@ -3117,7 +3117,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to @@ -3127,7 +3127,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to @@ -3137,7 +3137,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to @@ -3147,7 +3147,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i64_nxv4f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp undef to @@ -3157,7 +3157,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i64_nxv8f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to @@ -3167,7 +3167,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i64_nxv16f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef 
to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to @@ -3177,7 +3177,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i64_nxv32f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to @@ -3187,7 +3187,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i64_nxv64f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -3200,7 +3200,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> @@ -3210,7 +3210,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> @@ -3220,7 +3220,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> @@ -3230,7 +3230,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> @@ -3240,7 +3240,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: 
%v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> @@ -3250,7 +3250,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> @@ -3260,7 +3260,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to @@ -3270,7 +3270,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to @@ -3280,7 +3280,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %nxv2i64_nxv2f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to @@ -3290,7 +3290,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i64_nxv4f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp undef to @@ -3300,7 +3300,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i64_nxv8f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to @@ -3310,7 +3310,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i64_nxv16f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to @@ -3320,7 +3320,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp undef to ; 
RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i64_nxv32f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to @@ -3330,7 +3330,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i64_nxv64f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -3342,7 +3342,7 @@ define void @uitofp() { %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> - %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double> + %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> @@ -3353,7 +3353,7 @@ define void @uitofp() { %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> - %v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double> + %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> @@ -3364,7 +3364,7 @@ define void @uitofp() { %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> - %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double> + %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> @@ -3375,7 +3375,7 @@ define void @uitofp() { %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> - %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double> + %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> @@ -3386,7 +3386,7 @@ define void @uitofp() { %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float> - %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double> + %v32i64_v32f64 = uitofp 
<32 x i64> undef to <32 x double> %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> @@ -3397,7 +3397,7 @@ define void @uitofp() { %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> - %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double> + %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> @@ -3408,7 +3408,7 @@ define void @uitofp() { %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> - %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double> + %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> @@ -3419,7 +3419,7 @@ define void @uitofp() { %nxv1i32_nxv1f32 = uitofp undef to %nxv1i32_nxv1f64 = uitofp undef to %nxv1i64_nxv1f32 = uitofp undef to - %nxv1i64_nxv1f64 = uitofp undef to + %nxv1i64_nxv1f64 = uitofp undef to %nxv1i1_nxv1f32 = uitofp undef to %nxv1i1_nxv1f64 = uitofp undef to @@ -3430,7 +3430,7 @@ define void @uitofp() { %nxv2i32_nxv2f32 = uitofp undef to %nxv2i32_nxv2f64 = uitofp undef to %nxv2i64_nxv2f32 = uitofp undef to - %nxv2i64_nxv2f64 = uitofp undef to + %nxv2i64_nxv2f64 = uitofp undef to %nxv2i1_nxv2f32 = uitofp undef to %nxv2i1_nxv2f64 = uitofp undef to @@ -3441,7 +3441,7 @@ define void @uitofp() { %nxv4i32_nxv4f32 = uitofp undef to %nxv4i32_nxv4f64 = uitofp undef to %nxv4i64_nxv4f32 = uitofp undef to - %nxv4i64_nxv4f64 = uitofp undef to + %nxv4i64_nxv4f64 = uitofp undef to %nxv4i1_nxv4f32 = uitofp undef to %nxv4i1_nxv4f64 = uitofp undef to @@ -3452,7 +3452,7 @@ define void @uitofp() { %nxv8i32_nxv8f32 = uitofp undef to %nxv8i32_nxv8f64 = uitofp undef to %nxv8i64_nxv8f32 = uitofp undef to - %nxv8i64_nxv8f64 = uitofp undef to + %nxv8i64_nxv8f64 = uitofp undef to %nxv8i1_nxv8f32 = uitofp undef to %nxv8i1_nxv8f64 = uitofp undef to @@ -3463,7 +3463,7 @@ define void @uitofp() { %nxv16i32_nxv16f32 = uitofp undef to %nxv16i32_nxv16f64 = uitofp undef to %nxv16i64_nxv16f32 = uitofp undef to - %nxv16i64_nxv16f64 = uitofp undef to + %nxv16i64_nxv16f64 = uitofp undef to %nxv16i1_nxv16f32 = uitofp undef to %nxv16i1_nxv16f64 = uitofp undef to @@ -3474,7 +3474,7 @@ define void @uitofp() { %nxv32i32_nxv32f32 = uitofp undef to %nxv32i32_nxv32f64 = uitofp undef to %nxv32i64_nxv32f32 = uitofp undef to - %nxv32i64_nxv32f64 = uitofp undef to + %nxv32i64_nxv32f64 = uitofp undef to %nxv32i1_nxv32f32 = uitofp undef to %nxv32i1_nxv32f64 = uitofp undef to @@ -3485,7 +3485,7 @@ define void @uitofp() { %nxv64i32_nxv64f32 = uitofp undef to %nxv64i32_nxv64f64 = uitofp undef to %nxv64i64_nxv64f32 = uitofp undef to - %nxv64i64_nxv64f64 = uitofp undef to + %nxv64i64_nxv64f64 = uitofp undef to %nxv64i1_nxv64f32 = uitofp undef to %nxv64i1_nxv64f64 = uitofp undef to From 8b28e2ebb36d72cfffe04904e3e1b9fdfa36ef94 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 3 Sep 2024 21:14:36 -0700 Subject: [PATCH 027/425] [WebAssembly] Rename legacy EH tests (#107166) Give each test in `cfg-stackify-eh-legacy.ll` a name rather than something like `test5`, because I plan to copy many of these tests into a new file that tests for the new EH (exnref) and some of
the tests here are not applicable to the new EH so the numbering will be different, which can make things confusing. Also this removes `test_` prefixes in the test function names in `exception-legacy.ll`, because, well, we all know they are tests. --- .../WebAssembly/cfg-stackify-eh-legacy.ll | 122 +++++++++--------- .../CodeGen/WebAssembly/exception-legacy.ll | 40 +++--- 2 files changed, 82 insertions(+), 80 deletions(-) diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll index 21a0949debc12..cef92f459e4aa 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll @@ -16,7 +16,7 @@ target triple = "wasm32-unknown-unknown" ; Simple test case with two catch clauses ; ; void foo(); -; void test0() { +; void two_catches() { ; try { ; foo(); ; } catch (int) { @@ -24,7 +24,7 @@ target triple = "wasm32-unknown-unknown" ; } ; } -; CHECK-LABEL: test0: +; CHECK-LABEL: two_catches: ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -42,7 +42,7 @@ target triple = "wasm32-unknown-unknown" ; CHECK: end_block # label[[L2]]: ; CHECK: rethrow 0 # to caller ; CHECK: end_try # label[[L1]]: -define void @test0() personality ptr @__gxx_wasm_personality_v0 { +define void @two_catches() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -82,7 +82,7 @@ try.cont: ; preds = %catch, %catch2, %en } ; Nested try-catches within a catch -; void test1() { +; void nested_catch() { ; try { ; foo(); ; } catch (int) { @@ -94,7 +94,7 @@ try.cont: ; preds = %catch, %catch2, %en ; } ; } -; CHECK-LABEL: test1: +; CHECK-LABEL: nested_catch: ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -133,7 +133,7 @@ try.cont: ; preds = %catch, %catch2, %en ; CHECK: end_block # label[[L1]]: ; CHECK: call __cxa_end_catch ; CHECK: end_try -define void @test1() personality ptr @__gxx_wasm_personality_v0 { +define void @nested_catch() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont11 unwind label %catch.dispatch @@ -206,7 +206,7 @@ unreachable: ; preds = %rethrow5 } ; Nested loop within a catch clause -; void test2() { +; void loop_within_catch() { ; try { ; foo(); ; } catch (...) { @@ -215,7 +215,7 @@ unreachable: ; preds = %rethrow5 ; } ; } -; CHECK-LABEL: test2: +; CHECK-LABEL: loop_within_catch: ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -243,7 +243,7 @@ unreachable: ; preds = %rethrow5 ; CHECK: br 0 # 0: up to label[[L0]] ; CHECK: end_loop ; CHECK: end_try # label[[L3]]: -define void @test2() personality ptr @__gxx_wasm_personality_v0 { +define void @loop_within_catch() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -297,7 +297,7 @@ terminate: ; preds = %ehcleanup ; TRY marker should be placed at bb0 because there's a branch from bb0 to bb2, ; and scopes cannot be interleaved. 
-; NOOPT-LABEL: test3: +; NOOPT-LABEL: block_try_markers: ; NOOPT: try ; NOOPT: block ; NOOPT: block @@ -309,7 +309,7 @@ terminate: ; preds = %ehcleanup ; NOOPT: call bar ; NOOPT: catch {{.*}} ; NOOPT: end_try -define void @test3() personality ptr @__gxx_wasm_personality_v0 { +define void @block_try_markers() personality ptr @__gxx_wasm_personality_v0 { bb0: br i1 undef, label %bb1, label %bb2 @@ -343,13 +343,14 @@ try.cont: ; preds = %catch.start, %bb4, ; Tests if try/end_try markers are placed correctly wrt loop/end_loop markers, ; when try and loop markers are in the same BB and end_try and end_loop are in ; another BB. +; CHECK-LABEL: loop_try_markers: ; CHECK: loop ; CHECK: try ; CHECK: call foo ; CHECK: catch ; CHECK: end_try ; CHECK: end_loop -define void @test4(ptr %p) personality ptr @__gxx_wasm_personality_v0 { +define void @loop_try_markers(ptr %p) personality ptr @__gxx_wasm_personality_v0 { entry: store volatile i32 0, ptr %p br label %loop @@ -388,7 +389,7 @@ try.cont: ; preds = %catch.start, %loop ; try-catch with try-delegate that rethrows an exception to the caller to fix ; this. -; NOSORT-LABEL: test5: +; NOSORT-LABEL: unwind_mismatches_0: ; NOSORT: try ; --- try-delegate starts (catch unwind mismatch) ; NOSORT try @@ -407,7 +408,7 @@ try.cont: ; preds = %catch.start, %loop ; NOSORT: end_try ; NOSORT: return -define void @test5() personality ptr @__gxx_wasm_personality_v0 { +define void @unwind_mismatches_0() personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -446,7 +447,7 @@ try.cont: ; preds = %catch.start1, %catc ; And the return value of 'baz' should NOT be stackified because the BB is split ; during fixing unwind mismatches. -; NOSORT-LABEL: test6: +; NOSORT-LABEL: unwind_mismatches_1: ; NOSORT: try ; NOSORT: call foo ; --- try-delegate starts (call unwind mismatch) @@ -462,7 +463,7 @@ try.cont: ; preds = %catch.start1, %catc ; NOSORT: return ; NOSORT: end_try -define void @test6() personality ptr @__gxx_wasm_personality_v0 { +define void @unwind_mismatches_1() personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -486,11 +487,11 @@ try.cont: ; preds = %catch.start0 ret void } -; The same as test5, but we have one more call 'call @foo' in bb1 which unwinds -; to the caller. IN this case bb1 has two call unwind mismatches: 'call @foo' -; unwinds to the caller and 'call @bar' unwinds to catch C0. +; The same as unwind_mismatches_0, but we have one more call 'call @foo' in bb1 +; which unwinds to the caller. IN this case bb1 has two call unwind mismatches: + ; 'call @foo' unwinds to the caller and 'call @bar' unwinds to catch C0. -; NOSORT-LABEL: test7: +; NOSORT-LABEL: unwind_mismatches_2: ; NOSORT: try ; --- try-delegate starts (catch unwind mismatch) ; NOSORT try @@ -514,7 +515,7 @@ try.cont: ; preds = %catch.start0 ; NOSORT: end_try ; NOSORT: return -define void @test7() personality ptr @__gxx_wasm_personality_v0 { +define void @unwind_mismatches_2() personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -546,16 +547,17 @@ try.cont: ; preds = %catch.start1, %catc ret void } -; Similar situation as @test6. Here 'call @qux''s original unwind destination -; was the caller, but after control flow linearization, their unwind destination -; incorrectly becomes 'C0' within the function. 
We fix this by wrapping the call -; with a nested try-delegate that rethrows the exception to the caller. +; Similar situation as @unwind_mismatches_1. Here 'call @qux''s original unwind +; destination was the caller, but after control flow linearization, their unwind +; destination incorrectly becomes 'C0' within the function. We fix this by +; wrapping the call with a nested try-delegate that rethrows the exception to +; the caller. ; Because 'call @qux' pops an argument pushed by 'i32.const 5' from stack, the ; nested 'try' should be placed before `i32.const 5', not between 'i32.const 5' ; and 'call @qux'. -; NOSORT-LABEL: test8: +; NOSORT-LABEL: unwind_mismatches_3: ; NOSORT: try i32 ; NOSORT: call foo ; --- try-delegate starts (call unwind mismatch) @@ -569,7 +571,7 @@ try.cont: ; preds = %catch.start1, %catc ; NOSORT: return ; NOSORT: end_try -define i32 @test8() personality ptr @__gxx_wasm_personality_v0 { +define i32 @unwind_mismatches_3() personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -594,8 +596,8 @@ try.cont: ; preds = %catch.start0 ; Tests the case when TEE stackifies a register in RegStackify but it gets ; unstackified in fixCallUnwindMismatches in CFGStackify. -; NOSORT-LOCALS-LABEL: test9: -define void @test9(i32 %x) personality ptr @__gxx_wasm_personality_v0 { +; NOSORT-LOCALS-LABEL: unstackify_when_fixing_unwind_mismatch: +define void @unstackify_when_fixing_unwind_mismatch(i32 %x) personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -640,7 +642,7 @@ try.cont: ; preds = %catch.start0 ; first catch because it is a non-C++ exception, it shouldn't unwind to the next ; catch, but it should unwind to the caller. -; NOSORT-LABEL: test10: +; NOSORT-LABEL: unwind_mismatches_4: ; NOSORT: try ; --- try-delegate starts (catch unwind mismatch) ; NOSORT: try @@ -667,7 +669,7 @@ try.cont: ; preds = %catch.start0 ; NOSORT: end_try ; NOSORT: return -define void @test10() personality ptr @__gxx_wasm_personality_v0 { +define void @unwind_mismatches_4() personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -709,7 +711,7 @@ try.cont: ; preds = %catch.start1, %catc ; (before 'cont' is sorted) and there should not be any unwind destination ; mismatches in CFGStackify. -; NOOPT-LABEL: test11: +; NOOPT-LABEL: cfg_sort_order: ; NOOPT: block ; NOOPT: try ; NOOPT: call foo @@ -718,7 +720,7 @@ try.cont: ; preds = %catch.start1, %catc ; NOOPT: call foo ; NOOPT: end_block ; NOOPT: return -define void @test11(i32 %arg) personality ptr @__gxx_wasm_personality_v0 { +define void @cfg_sort_order(i32 %arg) personality ptr @__gxx_wasm_personality_v0 { entry: %tobool = icmp ne i32 %arg, 0 br i1 %tobool, label %if.then, label %if.end @@ -753,7 +755,7 @@ if.end: ; preds = %cont, %catch.start, ; invoke.cont BB fall within try~end_try, but they shouldn't cause crashes or ; unwinding destination mismatches in CFGStackify. 
-; NOSORT-LABEL: test12: +; NOSORT-LABEL: mem_intrinsics: ; NOSORT: try ; NOSORT: call foo ; NOSORT: call {{.*}} memcpy @@ -763,7 +765,7 @@ if.end: ; preds = %cont, %catch.start, ; NOSORT: catch_all ; NOSORT: rethrow 0 ; NOSORT: end_try -define void @test12(ptr %a, ptr %b) personality ptr @__gxx_wasm_personality_v0 { +define void @mem_intrinsics(ptr %a, ptr %b) personality ptr @__gxx_wasm_personality_v0 { entry: %o = alloca %class.Object, align 1 invoke void @foo() @@ -787,11 +789,11 @@ ehcleanup: ; preds = %entry ; 'nothrow_i32' and 'fun', because the return value of 'nothrow_i32' is ; stackified and pushed onto the stack to be consumed by the call to 'fun'. -; CHECK-LABEL: test13: +; CHECK-LABEL: try_marker_with_stackified_input: ; CHECK: try ; CHECK: call $push{{.*}}=, nothrow_i32 ; CHECK: call fun, $pop{{.*}} -define void @test13() personality ptr @__gxx_wasm_personality_v0 { +define void @try_marker_with_stackified_input() personality ptr @__gxx_wasm_personality_v0 { entry: %call = call i32 @nothrow_i32() invoke void @fun(i32 %call) @@ -809,7 +811,7 @@ terminate: ; preds = %entry ; This crashed on debug mode (= when NDEBUG is not defined) when the logic for ; computing the innermost region was not correct, in which a loop region ; contains an exception region. This should pass CFGSort without crashing. -define void @test14() personality ptr @__gxx_wasm_personality_v0 { +define void @loop_exception_region() personality ptr @__gxx_wasm_personality_v0 { entry: %e = alloca %class.MyClass, align 4 br label %for.cond @@ -886,8 +888,8 @@ terminate7: ; preds = %ehcleanup ; ... ; bb2: <- Continuation BB ; end -; CHECK-LABEL: test15: -define void @test15(i32 %n) personality ptr @__gxx_wasm_personality_v0 { +; CHECK-LABEL: remove_unnecessary_instrs: +define void @remove_unnecessary_instrs(i32 %n) personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %for.body unwind label %catch.dispatch @@ -925,7 +927,7 @@ try.cont: ; preds = %catch.start, %for.e } ; void foo(); -; void test16() { +; void remove_unnecessary_br() { ; try { ; foo(); ; try { @@ -955,8 +957,8 @@ try.cont: ; preds = %catch.start, %for.e ; bb3: <- Continuation BB ; end ; -; CHECK-LABEL: test16: -define void @test16() personality ptr @__gxx_wasm_personality_v0 { +; CHECK-LABEL: remove_unnecessary_br: +define void @remove_unnecessary_br() personality ptr @__gxx_wasm_personality_v0 { ; CHECK: call foo entry: invoke void @foo() @@ -1003,12 +1005,12 @@ invoke.cont2: ; preds = %catch.start ; path back to the loop header), and is placed after the loop latch block ; 'invoke.cont' intentionally. This tests if 'end_loop' marker is placed ; correctly not right after 'invoke.cont' part but after 'ehcleanup' part, -; NOSORT-LABEL: test17: +; NOSORT-LABEL: loop_contains_exception: ; NOSORT: loop ; NOSORT: try ; NOSORT: end_try ; NOSORT: end_loop -define void @test17(i32 %n) personality ptr @__gxx_wasm_personality_v0 { +define void @loop_contains_exception(i32 %n) personality ptr @__gxx_wasm_personality_v0 { entry: br label %while.cond @@ -1052,14 +1054,14 @@ while.end: ; preds = %while.body, %while. ; before its corresponding `catch_all`, because both `try` and `catch_all` body ; should satisfy the return type requirements. 
-; NOSORT-LABEL: test18: +; NOSORT-LABEL: fix_function_end_return_type_with_try_catch: ; NOSORT: try i32 ; NOSORT: loop i32 ; NOSORT: end_loop ; NOSORT: catch_all ; NOSORT: end_try ; NOSORT-NEXT: end_function -define i32 @test18(i32 %n) personality ptr @__gxx_wasm_personality_v0 { +define i32 @fix_function_end_return_type_with_try_catch(i32 %n) personality ptr @__gxx_wasm_personality_v0 { entry: %t = alloca %class.Object, align 1 br label %for.cond @@ -1097,7 +1099,7 @@ ehcleanup: ; preds = %if.then ; because the initial TRY placement for 'call @quux' was done before 'call @baz' ; because 'call @baz''s return value is stackified. -; CHECK-LABEL: test19: +; CHECK-LABEL: unwind_mismatches_5: ; CHECK: try ; CHECK: try ; CHECK: call $[[RET:[0-9]+]]=, baz @@ -1105,7 +1107,7 @@ ehcleanup: ; preds = %if.then ; CHECK: call quux, $[[RET]] ; CHECK: catch_all ; CHECK: end_try -define void @test19() personality ptr @__gxx_wasm_personality_v0 { +define void @unwind_mismatches_5() personality ptr @__gxx_wasm_personality_v0 { entry: %call = call i32 @baz() invoke void @quux(i32 %call) @@ -1147,10 +1149,10 @@ invoke.cont: ; preds = %entry ; becomes invalid because it incorrectly branches into an inner scope. The ; destination should change to the BB where (b) points. -; NOSORT-LABEL: test20: +; NOSORT-LABEL: branch_remapping_after_fixing_unwind_mismatches_0: ; NOSORT: try ; NOSORT: br_if 0 -define void @test20(i1 %arg) personality ptr @__gxx_wasm_personality_v0 { +define void @branch_remapping_after_fixing_unwind_mismatches_0(i1 %arg) personality ptr @__gxx_wasm_personality_v0 { entry: br i1 %arg, label %bb0, label %dest @@ -1187,8 +1189,8 @@ try.cont: ; preds = %catch.start1, %catc ret void } -; The similar case with test20, but multiple consecutive delegates are -; generated: +; The similar case with branch_remapping_after_fixing_unwind_mismatches_0, but +; multiple consecutive delegates are generated: ; - Before: ; block ; br (a) @@ -1214,7 +1216,7 @@ try.cont: ; preds = %catch.start1, %catc ; <- (b) The br destination should be remapped to here ; ; The test was reduced by bugpoint and should not crash in CFGStackify. -define void @test21() personality ptr @__gxx_wasm_personality_v0 { +define void @branch_remapping_after_fixing_unwind_mismatches_1() personality ptr @__gxx_wasm_personality_v0 { entry: br i1 undef, label %if.then, label %if.end12 @@ -1292,7 +1294,7 @@ unreachable: ; preds = %rethrow19, %invoke. ; Regression test for WasmEHFuncInfo's reverse mapping bug. 'UnwindDestToSrc' ; should return a vector and not a single BB, which was incorrect. ; This was reduced by bugpoint and should not crash in CFGStackify. -define void @test22() personality ptr @__gxx_wasm_personality_v0 { +define void @wasm_eh_func_info_regression_test() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %invoke.cont unwind label %catch.dispatch @@ -1348,7 +1350,7 @@ unreachable: ; preds = %invoke.cont8, %catc unreachable } -; void test23() { +; void exception_grouping_0() { ; try { ; try { ; throw 0; @@ -1364,7 +1366,7 @@ unreachable: ; preds = %invoke.cont8, %catc ; included in catch.start's exception. Also, after we take catch.start2's ; exception out of catch.start's exception, we have to take out try.cont8 out of ; catch.start's exception, because it has a predecessor in catch.start2. 
-define void @test23() personality ptr @__gxx_wasm_personality_v0 { +define void @exception_grouping_0() personality ptr @__gxx_wasm_personality_v0 { entry: %exception = call ptr @__cxa_allocate_exception(i32 4) #0 store i32 0, ptr %exception, align 16 @@ -1442,7 +1444,7 @@ unreachable: ; preds = %rethrow, %entry ; exception first, before taking out catch.start12's exception out of ; catch.start4's exception; otherwise we end up with an incorrect relationship ; of catch.start's exception > catch.start12's exception. -define void @test24() personality ptr @__gxx_wasm_personality_v0 { +define void @exception_grouping_1() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %invoke.cont unwind label %catch.dispatch @@ -1525,7 +1527,7 @@ unreachable: ; preds = %rethrow, %rethrow6 unreachable } -; void test25() { +; void exception_grouping_2() { ; try { ; try { ; throw 0; @@ -1545,7 +1547,7 @@ unreachable: ; preds = %rethrow, %rethrow6 ; contained in (a)'s exception. Because (a)'s unwind destination is (b), (b)'s ; exception is taken out of (a)'s. But because (c) is reachable from (b), we ; should make sure to take out (c)'s exception out of (a)'s exception too. -define void @test25() personality ptr @__gxx_wasm_personality_v0 { +define void @exception_grouping_2() personality ptr @__gxx_wasm_personality_v0 { entry: %exception = call ptr @__cxa_allocate_exception(i32 4) #1 store i32 0, ptr %exception, align 16 diff --git a/llvm/test/CodeGen/WebAssembly/exception-legacy.ll b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll index 3537baa425164..aa191209516f6 100644 --- a/llvm/test/CodeGen/WebAssembly/exception-legacy.ll +++ b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll @@ -10,10 +10,10 @@ target triple = "wasm32-unknown-unknown" ; CHECK: .tagtype __cpp_exception i32 -; CHECK-LABEL: test_throw: +; CHECK-LABEL: throw: ; CHECK: throw __cpp_exception, $0 ; CHECK-NOT: unreachable -define void @test_throw(ptr %p) { +define void @throw(ptr %p) { call void @llvm.wasm.throw(i32 0, ptr %p) ret void } @@ -21,14 +21,14 @@ define void @test_throw(ptr %p) { ; Simple test with a try-catch ; ; void foo(); -; void test_catch() { +; void catch() { ; try { ; foo(); ; } catch (int) { ; } ; } -; CHECK-LABEL: test_catch: +; CHECK-LABEL: catch: ; CHECK: global.get ${{.+}}=, __stack_pointer ; CHECK: try ; CHECK: call foo @@ -44,7 +44,7 @@ define void @test_throw(ptr %p) { ; CHECK: end_block ; CHECK: rethrow 0 ; CHECK: end_try -define void @test_catch() personality ptr @__gxx_wasm_personality_v0 { +define void @catch() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -79,12 +79,12 @@ try.cont: ; preds = %catch, %entry ; struct Temp { ; ~Temp() {} ; }; -; void test_cleanup() { +; void cleanup() { ; Temp t; ; foo(); ; } -; CHECK-LABEL: test_cleanup: +; CHECK-LABEL: cleanup: ; CHECK: try ; CHECK: call foo ; CHECK: catch_all @@ -92,7 +92,7 @@ try.cont: ; preds = %catch, %entry ; CHECK: call $drop=, _ZN4TempD2Ev ; CHECK: rethrow 0 ; CHECK: end_try -define void @test_cleanup() personality ptr @__gxx_wasm_personality_v0 { +define void @cleanup() personality ptr @__gxx_wasm_personality_v0 { entry: %t = alloca %struct.Temp, align 1 invoke void @foo() @@ -112,7 +112,7 @@ ehcleanup: ; preds = %entry ; temrinatepad, because __cxa_end_catch() also can throw within 'catch (...)'. ; ; void foo(); -; void test_terminatepad() { +; void terminatepad() { ; try { ; foo(); ; } catch (...) 
{ @@ -120,7 +120,7 @@ ehcleanup: ; preds = %entry ; } ; } -; CHECK-LABEL: test_terminatepad +; CHECK-LABEL: terminatepad ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -138,7 +138,7 @@ ehcleanup: ; preds = %entry ; CHECK: end_try ; CHECK: call __cxa_end_catch ; CHECK: end_try -define void @test_terminatepad() personality ptr @__gxx_wasm_personality_v0 { +define void @terminatepad() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -182,7 +182,7 @@ terminate: ; preds = %ehcleanup ; instructions after a catch instruction. ; ; void bar(int) noexcept; -; void test_no_prolog_epilog_in_ehpad() { +; void no_prolog_epilog_in_ehpad() { ; int stack_var = 0; ; bar(stack_var); ; try { @@ -192,7 +192,7 @@ terminate: ; preds = %ehcleanup ; } ; } -; CHECK-LABEL: test_no_prolog_epilog_in_ehpad +; CHECK-LABEL: no_prolog_epilog_in_ehpad ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -217,7 +217,7 @@ terminate: ; preds = %ehcleanup ; CHECK-NOT: global.set __stack_pointer, $pop{{.+}} ; CHECK: call __cxa_end_catch ; CHECK: end_try -define void @test_no_prolog_epilog_in_ehpad() personality ptr @__gxx_wasm_personality_v0 { +define void @no_prolog_epilog_in_ehpad() personality ptr @__gxx_wasm_personality_v0 { entry: %stack_var = alloca i32, align 4 call void @bar(ptr %stack_var) @@ -262,14 +262,14 @@ ehcleanup: ; preds = %catch ; store SP back to __stack_pointer global at the epilog. ; ; void foo(); -; void test_no_sp_writeback() { +; void no_sp_writeback() { ; try { ; foo(); ; } catch (...) { ; } ; } -; CHECK-LABEL: test_no_sp_writeback +; CHECK-LABEL: no_sp_writeback ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -278,7 +278,7 @@ ehcleanup: ; preds = %catch ; CHECK: end_try ; CHECK-NOT: global.set __stack_pointer ; CHECK: return -define void @test_no_sp_writeback() personality ptr @__gxx_wasm_personality_v0 { +define void @no_sp_writeback() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -300,7 +300,7 @@ try.cont: ; preds = %catch.start, %entry ; When the result of @llvm.wasm.get.exception is not used. This is created to ; fix a bug in LateEHPrepare and this should not crash. -define void @test_get_exception_wo_use() personality ptr @__gxx_wasm_personality_v0 { +define void @get_exception_wo_use() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -320,7 +320,7 @@ try.cont: ; preds = %catch.start, %entry ; Tests a case when a cleanup region (cleanuppad ~ clanupret) contains another ; catchpad -define void @test_complex_cleanup_region() personality ptr @__gxx_wasm_personality_v0 { +define void @complex_cleanup_region() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %invoke.cont unwind label %ehcleanup @@ -352,7 +352,7 @@ ehcleanupret: ; preds = %catch.start, %ehcle ; Regression test for the bug that 'rethrow' was not treated correctly as a ; terminator in isel. 
-define void @test_rethrow_terminator() personality ptr @__gxx_wasm_personality_v0 { +define void @rethrow_terminator() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch From 9fef09fd2918e7d8c357b98a9a798fe207941f73 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 4 Sep 2024 12:19:46 +0800 Subject: [PATCH 028/425] [Clang][CodeGen] Fix type for atomic float incdec operators (#107075) `llvm::ConstantFP::get(llvm::LLVMContext&, APFloat(float))` always returns a f32 constant. Fix https://github.com/llvm/llvm-project/issues/107054. --- clang/lib/CodeGen/CGExprScalar.cpp | 25 +- .../test/CodeGen/AMDGPU/amdgpu-atomic-float.c | 112 +++---- clang/test/CodeGen/X86/x86-atomic-double.c | 88 +++--- .../test/CodeGen/X86/x86-atomic-long_double.c | 293 ++++++++++++++---- 4 files changed, 339 insertions(+), 179 deletions(-) diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index af11bc20a3b63..7aa2d3d89c293 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2865,19 +2865,22 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, llvm::AtomicOrdering::SequentiallyConsistent); return isPre ? Builder.CreateBinOp(op, old, amt) : old; } - // Special case for atomic increment/decrement on floats + // Special case for atomic increment/decrement on floats. + // Bail out non-power-of-2-sized floating point types (e.g., x86_fp80). if (type->isFloatingType()) { - llvm::AtomicRMWInst::BinOp aop = - isInc ? llvm::AtomicRMWInst::FAdd : llvm::AtomicRMWInst::FSub; - llvm::Instruction::BinaryOps op = - isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub; - llvm::Value *amt = llvm::ConstantFP::get( - VMContext, llvm::APFloat(static_cast(1.0))); - llvm::AtomicRMWInst *old = - CGF.emitAtomicRMWInst(aop, LV.getAddress(), amt, - llvm::AtomicOrdering::SequentiallyConsistent); + llvm::Type *Ty = ConvertType(type); + if (llvm::has_single_bit(Ty->getScalarSizeInBits())) { + llvm::AtomicRMWInst::BinOp aop = + isInc ? llvm::AtomicRMWInst::FAdd : llvm::AtomicRMWInst::FSub; + llvm::Instruction::BinaryOps op = + isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub; + llvm::Value *amt = llvm::ConstantFP::get(Ty, 1.0); + llvm::AtomicRMWInst *old = + CGF.emitAtomicRMWInst(aop, LV.getAddress(), amt, + llvm::AtomicOrdering::SequentiallyConsistent); - return isPre ? Builder.CreateBinOp(op, old, amt) : old; + return isPre ? 
Builder.CreateBinOp(op, old, amt) : old; + } } value = EmitLoadOfLValue(LV, E->getExprLoc()); input = value; diff --git a/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c index 6deff1116e1d8..a8fb989b64de5 100644 --- a/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c +++ b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c @@ -99,20 +99,16 @@ float test_float_pre_inc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8 -// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: ret double [[TMP1]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), double 1.000000e+00 seq_cst, align 8 +// SAFE-NEXT: ret double [[TMP0]] // // UNSAFE-LABEL: define dso_local double @test_double_post_inc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: ret double [[TMP1]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), double 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: ret double [[TMP0]] // double test_double_post_inc() { @@ -125,20 +121,16 @@ double test_double_post_inc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8 -// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: ret double [[TMP1]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), double 1.000000e+00 seq_cst, align 8 +// SAFE-NEXT: ret double [[TMP0]] // // UNSAFE-LABEL: define dso_local double @test_double_post_dc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] -// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: ret double [[TMP1]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr 
addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), double 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: ret double [[TMP0]] // double test_double_post_dc() { @@ -151,22 +143,18 @@ double test_double_post_dc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8 -// SAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: ret double [[TMP2]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), double 1.000000e+00 seq_cst, align 8 +// SAFE-NEXT: [[TMP1:%.*]] = fsub double [[TMP0]], 1.000000e+00 +// SAFE-NEXT: ret double [[TMP1]] // // UNSAFE-LABEL: define dso_local double @test_double_pre_dc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] -// UNSAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: ret double [[TMP2]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), double 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: [[TMP1:%.*]] = fsub double [[TMP0]], 1.000000e+00 +// UNSAFE-NEXT: ret double [[TMP1]] // double test_double_pre_dc() { @@ -179,22 +167,18 @@ double test_double_pre_dc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8 -// SAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: ret double [[TMP2]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), double 1.000000e+00 seq_cst, align 8 +// SAFE-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// SAFE-NEXT: ret double [[TMP1]] // // UNSAFE-LABEL: define dso_local double @test_double_pre_inc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// UNSAFE-NEXT: 
[[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: ret double [[TMP2]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), double 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// UNSAFE-NEXT: ret double [[TMP1]] // double test_double_pre_inc() { @@ -207,20 +191,16 @@ double test_double_pre_inc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2 -// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: ret half [[TMP1]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), half 0xH3C00 seq_cst, align 2 +// SAFE-NEXT: ret half [[TMP0]] // // UNSAFE-LABEL: define dso_local half @test__Float16_post_inc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: ret half [[TMP1]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), half 0xH3C00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: ret half [[TMP0]] // _Float16 test__Float16_post_inc() { @@ -233,20 +213,16 @@ _Float16 test__Float16_post_inc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2 -// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: ret half [[TMP1]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), half 0xH3C00 seq_cst, align 2 +// SAFE-NEXT: ret half [[TMP0]] // // UNSAFE-LABEL: define dso_local half @test__Float16_post_dc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] -// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2 
-// UNSAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: ret half [[TMP1]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), half 0xH3C00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: ret half [[TMP0]] // _Float16 test__Float16_post_dc() { @@ -259,22 +235,18 @@ _Float16 test__Float16_post_dc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2 -// SAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: ret half [[TMP2]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), half 0xH3C00 seq_cst, align 2 +// SAFE-NEXT: [[TMP1:%.*]] = fsub half [[TMP0]], 0xH3C00 +// SAFE-NEXT: ret half [[TMP1]] // // UNSAFE-LABEL: define dso_local half @test__Float16_pre_dc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] -// UNSAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: ret half [[TMP2]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), half 0xH3C00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: [[TMP1:%.*]] = fsub half [[TMP0]], 0xH3C00 +// UNSAFE-NEXT: ret half [[TMP1]] // _Float16 test__Float16_pre_dc() { @@ -287,22 +259,18 @@ _Float16 test__Float16_pre_dc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 2 -// SAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: ret half [[TMP2]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), half 0xH3C00 seq_cst, align 2 +// SAFE-NEXT: [[TMP1:%.*]] = fadd half [[TMP0]], 0xH3C00 +// SAFE-NEXT: ret half [[TMP1]] // // UNSAFE-LABEL: define dso_local half @test__Float16_pre_inc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 
seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// UNSAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: ret half [[TMP2]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), half 0xH3C00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: [[TMP1:%.*]] = fadd half [[TMP0]], 0xH3C00 +// UNSAFE-NEXT: ret half [[TMP1]] // _Float16 test__Float16_pre_inc() { diff --git a/clang/test/CodeGen/X86/x86-atomic-double.c b/clang/test/CodeGen/X86/x86-atomic-double.c index 2354c89cc2b17..09c8f70c3db85 100644 --- a/clang/test/CodeGen/X86/x86-atomic-double.c +++ b/clang/test/CodeGen/X86/x86-atomic-double.c @@ -6,20 +6,14 @@ // X64-LABEL: define dso_local double @test_double_post_inc( // X64-SAME: ) #[[ATTR0:[0-9]+]] { // X64-NEXT: entry: -// X64-NEXT: [[RETVAL:%.*]] = alloca double, align 8 -// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.000000e+00 seq_cst, align 8 -// X64-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 8 -// X64-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT: ret double [[TMP1]] +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, double 1.000000e+00 seq_cst, align 8 +// X64-NEXT: ret double [[TMP0]] // // X86-LABEL: define dso_local double @test_double_post_inc( // X86-SAME: ) #[[ATTR0:[0-9]+]] { // X86-NEXT: entry: -// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 -// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.000000e+00 seq_cst, align 8 -// X86-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4 -// X86-NEXT: ret double [[TMP1]] +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, double 1.000000e+00 seq_cst, align 8 +// X86-NEXT: ret double [[TMP0]] // double test_double_post_inc() { @@ -30,20 +24,14 @@ double test_double_post_inc() // X64-LABEL: define dso_local double @test_double_post_dc( // X64-SAME: ) #[[ATTR0]] { // X64-NEXT: entry: -// X64-NEXT: [[RETVAL:%.*]] = alloca double, align 8 -// X64-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.000000e+00 seq_cst, align 8 -// X64-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 8 -// X64-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT: ret double [[TMP1]] +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, double 1.000000e+00 seq_cst, align 8 +// X64-NEXT: ret double [[TMP0]] // // X86-LABEL: define dso_local double @test_double_post_dc( // X86-SAME: ) #[[ATTR0]] { // X86-NEXT: entry: -// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 -// X86-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.000000e+00 seq_cst, align 8 -// X86-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4 -// X86-NEXT: ret double [[TMP1]] +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, double 1.000000e+00 seq_cst, align 8 +// X86-NEXT: ret double [[TMP0]] // double test_double_post_dc() { @@ -54,22 +42,16 @@ double test_double_post_dc() // X64-LABEL: define dso_local double @test_double_pre_dc( // X64-SAME: ) #[[ATTR0]] { // X64-NEXT: entry: -// X64-NEXT: [[RETVAL:%.*]] = 
alloca double, align 8 -// X64-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, float 1.000000e+00 seq_cst, align 8 -// X64-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 8 -// X64-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT: ret double [[TMP2]] +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, double 1.000000e+00 seq_cst, align 8 +// X64-NEXT: [[TMP1:%.*]] = fsub double [[TMP0]], 1.000000e+00 +// X64-NEXT: ret double [[TMP1]] // // X86-LABEL: define dso_local double @test_double_pre_dc( // X86-SAME: ) #[[ATTR0]] { // X86-NEXT: entry: -// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 -// X86-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, float 1.000000e+00 seq_cst, align 8 -// X86-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 4 -// X86-NEXT: ret double [[TMP2]] +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, double 1.000000e+00 seq_cst, align 8 +// X86-NEXT: [[TMP1:%.*]] = fsub double [[TMP0]], 1.000000e+00 +// X86-NEXT: ret double [[TMP1]] // double test_double_pre_dc() { @@ -80,25 +62,43 @@ double test_double_pre_dc() // X64-LABEL: define dso_local double @test_double_pre_inc( // X64-SAME: ) #[[ATTR0]] { // X64-NEXT: entry: -// X64-NEXT: [[RETVAL:%.*]] = alloca double, align 8 -// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, float 1.000000e+00 seq_cst, align 8 -// X64-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 8 -// X64-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT: ret double [[TMP2]] +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, double 1.000000e+00 seq_cst, align 8 +// X64-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// X64-NEXT: ret double [[TMP1]] // // X86-LABEL: define dso_local double @test_double_pre_inc( // X86-SAME: ) #[[ATTR0]] { // X86-NEXT: entry: -// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 -// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, float 1.000000e+00 seq_cst, align 8 -// X86-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 4 -// X86-NEXT: ret double [[TMP2]] +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, double 1.000000e+00 seq_cst, align 8 +// X86-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// X86-NEXT: ret double [[TMP1]] // double test_double_pre_inc() { static _Atomic double n; return ++n; } + +// X64-LABEL: define dso_local i32 @pr107054( +// X64-SAME: ) #[[ATTR0]] { +// X64-NEXT: entry: +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @pr107054.n, double 1.000000e+00 seq_cst, align 8 +// X64-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// X64-NEXT: [[CMP:%.*]] = fcmp oeq double [[TMP1]], 1.000000e+00 +// X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +// X64-NEXT: ret i32 [[CONV]] +// +// X86-LABEL: define dso_local i32 @pr107054( +// X86-SAME: ) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @pr107054.n, double 1.000000e+00 seq_cst, align 8 +// X86-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// X86-NEXT: [[CMP:%.*]] = fcmp oeq double [[TMP1]], 
1.000000e+00 +// X86-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +// X86-NEXT: ret i32 [[CONV]] +// +int pr107054() +{ + static _Atomic double n; + return (++n) == 1; +} diff --git a/clang/test/CodeGen/X86/x86-atomic-long_double.c b/clang/test/CodeGen/X86/x86-atomic-long_double.c index 2c3f381f13511..9c82784807dac 100644 --- a/clang/test/CodeGen/X86/x86-atomic-long_double.c +++ b/clang/test/CodeGen/X86/x86-atomic-long_double.c @@ -4,29 +4,60 @@ // X64-LABEL: define dso_local x86_fp80 @testinc( // X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { -// X64-NEXT: [[ENTRY:.*:]] -// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ENTRY:.*]]: // X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 // X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 // X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// X64-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// X64-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// X64-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 16 -// X64-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// X64-NEXT: ret x86_fp80 [[TMP3]] +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP2]], 0xK3FFF8000000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0 +// X64-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +// X64-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: ret x86_fp80 [[INC]] // // X86-LABEL: define dso_local x86_fp80 @testinc( // X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ENTRY:.*]]: // X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 // X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// X86-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 
1.000000e+00 seq_cst, align 4 -// X86-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// X86-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// X86-NEXT: ret x86_fp80 [[TMP3]] +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP2]], 0xK3FFF8000000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: ret x86_fp80 [[INC]] // long double testinc(_Atomic long double *addr) { @@ -35,27 +66,60 @@ long double testinc(_Atomic long double *addr) { // X64-LABEL: define dso_local x86_fp80 @testdec( // X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X64-NEXT: [[ENTRY:.*:]] -// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ENTRY:.*]]: // X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 // X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 // X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// X64-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 16 -// X64-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// X64-NEXT: ret x86_fp80 [[TMP2]] +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[DEC:%.*]] = fadd x86_fp80 [[TMP2]], 0xKBFFF8000000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[DEC]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP6:%.*]] = 
extractvalue { i128, i1 } [[TMP5]], 0 +// X64-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +// X64-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: ret x86_fp80 [[TMP1]] // // X86-LABEL: define dso_local x86_fp80 @testdec( // X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ENTRY:.*]]: // X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 // X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// X86-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 -// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// X86-NEXT: ret x86_fp80 [[TMP2]] +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[DEC:%.*]] = fadd x86_fp80 [[TMP2]], 0xKBFFF8000000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[DEC]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: ret x86_fp80 [[TMP1]] // long double testdec(_Atomic long double *addr) { @@ -175,29 +239,60 @@ long double testassign(_Atomic long double *addr) { // X64-LABEL: define dso_local x86_fp80 @test_volatile_inc( // X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X64-NEXT: [[ENTRY:.*:]] -// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ENTRY:.*]]: // X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 // X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 // X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// X64-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// X64-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// X64-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 16 -// X64-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// X64-NEXT: ret x86_fp80 [[TMP3]] +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic 
volatile i128, ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP2]], 0xK3FFF8000000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0 +// X64-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +// X64-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: ret x86_fp80 [[INC]] // // X86-LABEL: define dso_local x86_fp80 @test_volatile_inc( // X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ENTRY:.*]]: // X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 // X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// X86-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 -// X86-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// X86-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// X86-NEXT: ret x86_fp80 [[TMP3]] +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP2]], 0xK3FFF8000000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: ret x86_fp80 [[INC]] // long double 
test_volatile_inc(volatile _Atomic long double *addr) { return ++*addr; @@ -205,27 +300,60 @@ long double test_volatile_inc(volatile _Atomic long double *addr) { // X64-LABEL: define dso_local x86_fp80 @test_volatile_dec( // X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X64-NEXT: [[ENTRY:.*:]] -// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ENTRY:.*]]: // X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 // X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 // X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// X64-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 16 -// X64-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// X64-NEXT: ret x86_fp80 [[TMP2]] +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[DEC:%.*]] = fadd x86_fp80 [[TMP2]], 0xKBFFF8000000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[DEC]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0 +// X64-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +// X64-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: ret x86_fp80 [[TMP1]] // // X86-LABEL: define dso_local x86_fp80 @test_volatile_dec( // X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ENTRY:.*]]: // X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 // X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// X86-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 -// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// X86-NEXT: ret x86_fp80 [[TMP2]] +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: 
[[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[DEC:%.*]] = fadd x86_fp80 [[TMP2]], 0xKBFFF8000000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[DEC]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: ret x86_fp80 [[TMP1]] // long double test_volatile_dec(volatile _Atomic long double *addr) { return (*addr)--; @@ -341,3 +469,64 @@ long double test_volatile_assign(volatile _Atomic long double *addr) { return *addr; } + +// X64-LABEL: define dso_local i32 @pr107054( +// X64-SAME: ) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*]]: +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr @pr107054.n seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP0:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP1:%.*]] = phi x86_fp80 [ [[TMP0]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP1]], 0xK3FFF8000000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP1]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP2:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = cmpxchg ptr @pr107054.n, i128 [[TMP2]], i128 [[TMP3]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP5:%.*]] = extractvalue { i128, i1 } [[TMP4]], 0 +// X64-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP4]], 1 +// X64-NEXT: store i128 [[TMP5]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP7]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP6]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: [[CMP:%.*]] = fcmp oeq x86_fp80 [[INC]], 0xK3FFF8000000000000000 +// X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +// X64-NEXT: ret i32 [[CONV]] +// +// X86-LABEL: define dso_local i32 @pr107054( +// X86-SAME: ) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*]]: +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr 
noundef @pr107054.n, ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP0:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP1:%.*]] = phi x86_fp80 [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP1]], 0xK3FFF8000000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP1]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef @pr107054.n, ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP2]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: [[CMP:%.*]] = fcmp oeq x86_fp80 [[INC]], 0xK3FFF8000000000000000 +// X86-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +// X86-NEXT: ret i32 [[CONV]] +// +int pr107054() +{ + static _Atomic long double n; + return (++n) == 1; +} From 6c607cfb2c2d8acd2b92d7ed8106ab1e4fc0d79d Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 4 Sep 2024 12:57:09 +0800 Subject: [PATCH 029/425] [RISCV] Preserve tail agnostic policy in foldVMV_V_V (#105788) This patch helps avoid regressions in an upcoming patch by making sure we don't accidentally lose a tail agnostic policy when folding a vmv.v.v into its source. The previous comment about RISCVInsertVSETVLI relaxing the policy didn't take into account the fact that there's a policy operand on vmv.v.v, which can be tail agnostic. If the tail is agnostic (via either the policy operand or the passthru being undef) and vmv.v.v's VL <= Src's VL, then Src's tail can be made agnostic. --- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 11 +++-- .../CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 42 +++++++++++++++++++ 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 412fd790061a3..35c3bc9708d91 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -529,10 +529,13 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { *Src->getParent()->getParent())); } - // Use a conservative tu,mu policy, RISCVInsertVSETVLI will relax it if - // passthru is undef. - Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())) - .setImm(RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED); + // If MI was tail agnostic and the VL didn't increase, preserve it. 
+ int64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; + bool TailAgnostic = (MI.getOperand(5).getImm() & RISCVII::TAIL_AGNOSTIC) || + Passthru.getReg() == RISCV::NoRegister; + if (TailAgnostic && isVLKnownLE(MI.getOperand(3), SrcVL)) + Policy |= RISCVII::TAIL_AGNOSTIC; + Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); MRI->replaceRegWith(MI.getOperand(0).getReg(), Src->getOperand(0).getReg()); MI.eraseFromParent(); diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir index b2526c6df6939..771b2073370e6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir @@ -18,3 +18,45 @@ body: | %y:gpr = ADDI $x0, 1 %z:vr = PseudoVMV_V_V_M1 %passthru, %x, 4, 5 /* e32 */, 0 /* tu, mu */ ... +--- +name: tail_agnostic +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: tail_agnostic + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 1 /* ta, mu */ + %passthru:vr = COPY $v8 + %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %y:vr = PseudoVMV_V_V_M1 %passthru, %x, 4, 5 /* e32 */, 1 /* ta, mu */ +... +--- +name: tail_agnostic_larger_vl +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: tail_agnostic_larger_vl + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %passthru:vr = COPY $v8 + %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %y:vr = PseudoVMV_V_V_M1 %passthru, %x, 5, 5 /* e32 */, 1 /* ta, mu */ +... +--- +name: undef_passthru_src_undef_passthru +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: undef_passthru_src_undef_passthru + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 4, 5 /* e32 */, 1 /* ta, mu */ + %passthru:vr = COPY $v8 + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 0 /* tu, mu */ +... From c94bd96c277e0b48e198fdc831bb576d9a04aced Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 4 Sep 2024 13:36:32 +0800 Subject: [PATCH 030/425] [Clang][CodeGen] Don't emit assumptions if current block is unreachable. (#106936) Fixes https://github.com/llvm/llvm-project/issues/106898. When emitting an infinite loop, clang codegen will delete the whole block and leave builder's current block as nullptr: https://github.com/llvm/llvm-project/blob/837ee5b46a5f7f898f0de7e46a19600b896a0a1f/clang/lib/CodeGen/CGStmt.cpp#L597-L600 Then clang will create `zext (icmp slt %a, %b)` without parent block for `a < b`. It will crash here: https://github.com/llvm/llvm-project/blob/837ee5b46a5f7f898f0de7e46a19600b896a0a1f/clang/lib/CodeGen/CGExprScalar.cpp#L416-L420 Even if we disabled this optimization, it still crashes in `Builder.CreateAssumption`: https://github.com/llvm/llvm-project/blob/837ee5b46a5f7f898f0de7e46a19600b896a0a1f/llvm/lib/IR/IRBuilder.cpp#L551-L561 This patch disables assumptions emission if current block is null. 
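
For illustration, a minimal standalone reproducer (a sketch mirroring the regression test added below; the function name and the suggested driver flags are illustrative assumptions, not part of the patch): the infinite loop leaves the IR builder with no insertion block, so emitting the assumption afterwards used to crash.

// Sketch of the crashing pattern (gh106898). Compile with a C++23-capable
// clang; the exact flags are an assumption, e.g. -std=c++23 -S -emit-llvm.
int crash_me() {
  while (true)
    ;                  // infinite loop: CodeGen deletes the dead block that follows
  int a = 0, b = 1;
  [[assume(a < b)]];   // previously emitted llvm.assume with a null insert block
  return a + b;        // unreachable
}

With the added Builder.GetInsertBlock() check, CodeGen simply skips the assumption when there is no current block, the same way other statements in unreachable code are dropped.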
--- clang/lib/CodeGen/CGStmt.cpp | 2 +- clang/test/SemaCXX/cxx23-assume.cpp | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 7158a06e6bc3b..b138c87a85349 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -752,7 +752,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { } break; case attr::CXXAssume: { const Expr *Assumption = cast(A)->getAssumption(); - if (getLangOpts().CXXAssumptions && + if (getLangOpts().CXXAssumptions && Builder.GetInsertBlock() && !Assumption->HasSideEffects(getContext())) { llvm::Value *AssumptionVal = EvaluateExprAsBool(Assumption); Builder.CreateAssumption(AssumptionVal); diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp index 9138501d726dd..eeae59daea3f7 100644 --- a/clang/test/SemaCXX/cxx23-assume.cpp +++ b/clang/test/SemaCXX/cxx23-assume.cpp @@ -158,3 +158,12 @@ foo (int x, int y) return x + y; } } + +// Do not crash when assumptions are unreachable. +namespace gh106898 { +int foo () { + while(1); + int a = 0, b = 1; + __attribute__((assume (a < b))); +} +} From 3e798476de466e8a051d3e753db379731a8d9705 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 3 Sep 2024 22:44:49 -0700 Subject: [PATCH 031/425] [LegalizeDAG][RISCV] Don't promote f16 vector ISD::FNEG/FABS/FCOPYSIGN to f32 when we don't have Zvfh. (#106652) The fp_extend will canonicalize NaNs which is not the semantics of FNEG/FABS/FCOPYSIGN. For fixed vectors I'm scalarizing due to test changes on other targets where the scalarization is expected. I will try to address in a follow up. For scalable vectors, we bitcast to integer and use integer logic ops. --- .../SelectionDAG/LegalizeVectorOps.cpp | 63 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 20 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 3279 +++++++++++++++-- llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll | 65 +- .../CodeGen/RISCV/rvv/vfcopysign-sdnode.ll | 647 ++-- .../RISCV/rvv/vfmsub-constrained-sdnode.ll | 275 +- llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll | 59 +- .../RISCV/rvv/vfnmadd-constrained-sdnode.ll | 424 +-- .../RISCV/rvv/vfnmsub-constrained-sdnode.ll | 341 +- 9 files changed, 3691 insertions(+), 1482 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 297c349ae4e2f..29dae4e27c768 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -142,6 +142,8 @@ class VectorLegalizer { std::pair ExpandLoad(SDNode *N); SDValue ExpandStore(SDNode *N); SDValue ExpandFNEG(SDNode *Node); + SDValue ExpandFABS(SDNode *Node); + SDValue ExpandFCOPYSIGN(SDNode *Node); void ExpandFSUB(SDNode *Node, SmallVectorImpl &Results); void ExpandSETCC(SDNode *Node, SmallVectorImpl &Results); void ExpandBITREVERSE(SDNode *Node, SmallVectorImpl &Results); @@ -942,6 +944,18 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { return; } break; + case ISD::FABS: + if (SDValue Expanded = ExpandFABS(Node)) { + Results.push_back(Expanded); + return; + } + break; + case ISD::FCOPYSIGN: + if (SDValue Expanded = ExpandFCOPYSIGN(Node)) { + Results.push_back(Expanded); + return; + } + break; case ISD::FSUB: ExpandFSUB(Node, Results); return; @@ -1781,7 +1795,7 @@ SDValue VectorLegalizer::ExpandFNEG(SDNode *Node) { // FIXME: The FSUB check is here to force unrolling v1f64 vectors on AArch64. 
if (!TLI.isOperationLegalOrCustom(ISD::XOR, IntVT) || - !TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) + !(TLI.isOperationLegalOrCustom(ISD::FSUB, VT) || VT.isScalableVector())) return SDValue(); SDLoc DL(Node); @@ -1792,6 +1806,53 @@ SDValue VectorLegalizer::ExpandFNEG(SDNode *Node) { return DAG.getNode(ISD::BITCAST, DL, VT, Xor); } +SDValue VectorLegalizer::ExpandFABS(SDNode *Node) { + EVT VT = Node->getValueType(0); + EVT IntVT = VT.changeVectorElementTypeToInteger(); + + // FIXME: We shouldn't restrict this to scalable vectors. + if (!TLI.isOperationLegalOrCustom(ISD::AND, IntVT) || !VT.isScalableVector()) + return SDValue(); + + SDLoc DL(Node); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0)); + SDValue ClearSignMask = DAG.getConstant( + APInt::getSignedMaxValue(IntVT.getScalarSizeInBits()), DL, IntVT); + SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, Cast, ClearSignMask); + return DAG.getNode(ISD::BITCAST, DL, VT, ClearedSign); +} + +SDValue VectorLegalizer::ExpandFCOPYSIGN(SDNode *Node) { + EVT VT = Node->getValueType(0); + EVT IntVT = VT.changeVectorElementTypeToInteger(); + + // FIXME: We shouldn't restrict this to scalable vectors. + if (VT != Node->getOperand(1).getValueType() || + !TLI.isOperationLegalOrCustom(ISD::AND, IntVT) || + !TLI.isOperationLegalOrCustom(ISD::OR, IntVT) || !VT.isScalableVector()) + return SDValue(); + + SDLoc DL(Node); + SDValue Mag = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0)); + SDValue Sign = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(1)); + + SDValue SignMask = DAG.getConstant( + APInt::getSignMask(IntVT.getScalarSizeInBits()), DL, IntVT); + SDValue SignBit = DAG.getNode(ISD::AND, DL, IntVT, Sign, SignMask); + + SDValue ClearSignMask = DAG.getConstant( + APInt::getSignedMaxValue(IntVT.getScalarSizeInBits()), DL, IntVT); + SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, Mag, ClearSignMask); + + SDNodeFlags Flags; + Flags.setDisjoint(true); + + SDValue CopiedSign = + DAG.getNode(ISD::OR, DL, IntVT, ClearedSign, SignBit, Flags); + + return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign); +} + void VectorLegalizer::ExpandFSUB(SDNode *Node, SmallVectorImpl &Results) { // For floating-point values, (a-b) is the same as a+(-b). If FNEG is legal, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 3742b897ca568..5089bbbe3c0d7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -934,13 +934,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // TODO: support more ops. static const unsigned ZvfhminPromoteOps[] = { - ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, ISD::FSUB, - ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT, - ISD::FABS, ISD::FNEG, ISD::FCOPYSIGN, ISD::FCEIL, - ISD::FFLOOR, ISD::FROUND, ISD::FROUNDEVEN, ISD::FRINT, - ISD::FNEARBYINT, ISD::IS_FPCLASS, ISD::SETCC, ISD::FMAXIMUM, - ISD::FMINIMUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, - ISD::STRICT_FDIV, ISD::STRICT_FSQRT, ISD::STRICT_FMA}; + ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, ISD::FSUB, + ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT, + ISD::FCEIL, ISD::FFLOOR, ISD::FROUND, ISD::FROUNDEVEN, + ISD::FRINT, ISD::FNEARBYINT, ISD::IS_FPCLASS, ISD::SETCC, + ISD::FMAXIMUM, ISD::FMINIMUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, + ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FSQRT, ISD::STRICT_FMA}; // TODO: support more vp ops. 
static const unsigned ZvfhminPromoteVPOps[] = {ISD::VP_FADD, @@ -1082,6 +1081,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // load/store setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + // Custom split nxv32f16 since nxv32f32 if not legal. if (VT == MVT::nxv32f16) { setOperationAction(ZvfhminPromoteOps, VT, Custom); @@ -1337,6 +1340,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // available. setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom); } + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); // Don't promote f16 vector operations to f32 if f32 vector type is // not legal. diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 1843157573257..56cd718536daa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -516,14 +516,50 @@ define void @fneg_v8f16(ptr %x) { ; ; ZVFHMIN-LABEL: fneg_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vse16.v v8, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: flh fa4, 0(sp) +; ZVFHMIN-NEXT: flh fa3, 4(sp) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-NEXT: lui a3, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-NEXT: xor a4, a4, a3 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = fneg <8 x half> %a @@ -542,35 +578,112 @@ define void @fneg_v6f16(ptr %x) { ; ; ZVFHMIN-RV32-LABEL: fneg_v6f16: ; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; 
ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV32-NEXT: lui a3, 1048568 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: xor a4, a4, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV32-NEXT: xor a5, a5, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV32-NEXT: xor a6, a6, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: lui t0, 8 +; ZVFHMIN-RV32-NEXT: xor a7, a7, t0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a7 +; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-RV32-NEXT: xor a6, a7, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-RV32-NEXT: xor a3, a6, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fneg_v6f16: ; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV64-NEXT: lui a3, 1048568 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV64-NEXT: lui a5, 8 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV64-NEXT: 
fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV64-NEXT: xor a4, a4, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = fneg <6 x half> %a @@ -623,17 +736,101 @@ define void @fabs_v8f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: fabs_v8f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-RV32-LABEL: fabs_v8f16: +; ZVFHMIN-RV32: # %bb.0: +; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV32-NEXT: lui a3, 8 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV32-NEXT: addi a3, a3, -1 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; 
ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-RV32-NEXT: ret +; +; ZVFHMIN-RV64-LABEL: fabs_v8f16: +; ZVFHMIN-RV64: # %bb.0: +; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV64-NEXT: lui a3, 8 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV64-NEXT: addiw a3, a3, -1 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-RV64-NEXT: ret %a = load <8 x half>, ptr %x %b = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a) store <8 x half> %b, ptr %x @@ -652,35 +849,112 @@ define void @fabs_v6f16(ptr %x) { ; ; ZVFHMIN-RV32-LABEL: fabs_v6f16: ; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV32-NEXT: lui a3, 8 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV32-NEXT: addi a3, a3, -1 +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: and a4, a4, a3 +; 
ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: and a5, a5, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: and a6, a6, a3 +; ZVFHMIN-RV32-NEXT: and a7, a7, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a7 +; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-RV32-NEXT: and a6, a7, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-RV32-NEXT: and a3, a6, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fabs_v6f16: ; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV64-NEXT: lui a3, 8 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV64-NEXT: addiw a3, a3, -1 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; 
ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.fabs.v6f16(<6 x half> %a) @@ -737,19 +1011,287 @@ define void @copysign_v8f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: copysign_v8f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v8, v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_v8f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: ret +; +; ZVFHMIN-ZFH-RV64-LABEL: copysign_v8f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; 
ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_v8f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui t1, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, t1, -1 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, t0, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, t2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a7, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a6, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a5, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_v8f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, 
fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui t1, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a2, t1, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, t1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, t0, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, t2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a7, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a6, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a5, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, t1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y %c = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) @@ -768,58 +1310,331 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: copysign_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; 
ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_v6f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: ret ; -; ZVFHMIN-RV64-LABEL: copysign_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; 
ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret - %a = load <6 x half>, ptr %x - %b = load <6 x half>, ptr %y - %c = call <6 x half> @llvm.copysign.v6f16(<6 x half> %a, <6 x half> %b) - store <6 x half> %c, ptr %x - ret void -} -declare <6 x half> @llvm.copysign.v6f16(<6 x half>, <6 x half>) - -define void @copysign_v4f32(ptr %x, ptr %y) { -; ZVFH-LABEL: copysign_v4f32: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZVFH-NEXT: vle32.v v8, (a0) -; ZVFH-NEXT: vle32.v v9, (a1) -; ZVFH-NEXT: vfsgnj.vv v8, v8, v9 -; ZVFH-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-LABEL: copysign_v6f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_v6f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui t1, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, t1, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a4, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t0, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a7, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t2, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t0, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t2, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t1, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, t1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a7, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_v6f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui t1, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a2, t1, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, t1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, t0, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, t2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a7, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a6, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a5, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, t1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, 
a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret + %a = load <6 x half>, ptr %x + %b = load <6 x half>, ptr %y + %c = call <6 x half> @llvm.copysign.v6f16(<6 x half> %a, <6 x half> %b) + store <6 x half> %c, ptr %x + ret void +} +declare <6 x half> @llvm.copysign.v6f16(<6 x half>, <6 x half>) + +define void @copysign_v4f32(ptr %x, ptr %y) { +; ZVFH-LABEL: copysign_v4f32: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVFH-NEXT: vle32.v v8, (a0) +; ZVFH-NEXT: vle32.v v9, (a1) +; ZVFH-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFH-NEXT: vse32.v v8, (a0) ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: copysign_v4f32: @@ -864,20 +1679,215 @@ define void @copysign_vf_v8f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: copysign_vf_v8f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_vf_v8f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: 
fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: ret +; +; ZVFHMIN-ZFH-RV64-LABEL: copysign_vf_v8f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_vf_v8f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, a3, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a1 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_vf_v8f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a3, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, a3, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a2 +; 
ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <8 x half>, ptr %x %b = insertelement <8 x half> poison, half %y, i32 0 %c = shufflevector <8 x half> %b, <8 x half> poison, <8 x i32> zeroinitializer @@ -895,52 +1905,247 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: copysign_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_vf_v6f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.h fa5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; 
ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: ret ; -; ZVFHMIN-RV64-LABEL: copysign_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-ZFH-RV64-LABEL: copysign_vf_v6f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.h fa5, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; 
ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_vf_v6f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a5, a2, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, a7, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t0, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, t1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_vf_v6f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a4, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a4, a4, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -999,24 +2204,303 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: copysign_neg_v8f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v8, v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_v8f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 24(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft2, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft3, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft1, ft2, ft1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, ft1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft0, ft3, ft0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, ft0 +; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa0, ft0, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa1, ft1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa2, fa0, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa3, fa1, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: ret +; +; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_v8f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 24(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh 
fa2, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft2, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft3, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft1, ft2, ft1 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, ft1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft0, ft3, ft0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, ft0 +; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa0, ft0, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa1, ft1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa2, fa0, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa3, fa1, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_v8f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a2, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: not a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a6, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a5, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a5) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: not a7, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t0, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: lui t3, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, t3, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, a5, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a5, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t2, t2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t2, t4, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t4, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a7, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a7, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_v8f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a2, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; 
ZVFHMIN-ZFHIN-RV64-NEXT: not a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a6, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a5, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a5) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: not a7, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t0, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui t3, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, t3, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, a5, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a5, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t2, t2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or t2, t4, t2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, t4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, t3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or t0, t4, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a7, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, t1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a6, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, t3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a7, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y %c = fneg <8 x half> %b @@ -1035,52 +2519,331 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: copysign_neg_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: 
vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_v6f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 24(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft2, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft3, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft1, ft2, ft1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, ft1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft0, ft3, ft0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, ft0 +; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa0, ft0, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa1, ft1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa2, fa0, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa3, fa1, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: ret ; -; ZVFHMIN-RV64-LABEL: 
copysign_neg_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_v6f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 24(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft2, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft3, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft1, ft2, ft1 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, ft1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft0, ft3, ft0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, ft0 +; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa0, ft0, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa1, ft1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa2, fa0, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa3, fa1, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, 
(a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_v6f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a2, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: not a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a4, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a4) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: not t3, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a5, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui t0, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, t0, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, a7, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a7, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, t4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, t4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and t3, t3, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t3, t4, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and t2, t2, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t2, t4, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t1, t4, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, t4, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a7 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a6, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_v6f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a2, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: not a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a6, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a5, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a5) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: not a7, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t0, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui t3, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, t3, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, a5, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a5, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t2, t2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or t2, t4, t2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, t4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, t3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or t0, t4, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx 
v8, v8, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a7, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, t1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a6, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, t3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a7, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = fneg <6 x half> %b @@ -1143,25 +2906,187 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: copysign_neg_trunc_v4f16_v4f32: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vle32.v v9, (a1) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_trunc_v4f16_v4f32: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 12(sp) +; 
ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: ret +; +; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_trunc_v4f16_v4f32: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_trunc_v4f16_v4f32: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a2, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a6, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_trunc_v4f16_v4f32: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a4, a2, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a6, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a1, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <4 x half>, ptr %x %b = load <4 x float>, ptr %y %c = fneg <4 x float> 
%b @@ -1185,65 +3110,215 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 3, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v10, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: addi a1, sp, 8 -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV32-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 24 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsh fa5, 4(a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: ret ; -; ZVFHMIN-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e64, m1, 
ta, ma -; ZVFHMIN-RV64-NEXT: vle64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle32.v v9, (a1) -; ZVFHMIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v10, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: addi a1, sp, 8 -; ZVFHMIN-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV64-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse32.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vle32.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vle64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v10, v8 +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v10, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 24 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsh fa5, 4(a0) +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 3, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; 
ZVFHMIN-ZFHIN-RV32-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a2, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a6, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 24 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 4(a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vle32.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vle64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v10, v8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v10, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a4, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a5, a4, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a6 +; 
ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 24 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 4(a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <3 x half>, ptr %x %b = load <3 x float>, ptr %y %c = fneg <3 x float> %b @@ -1543,23 +3618,59 @@ define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) { ; ; ZVFHMIN-LABEL: fmsub_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-NEXT: vle16.v v8, (a2) ; ZVFHMIN-NEXT: vle16.v v9, (a0) ; ZVFHMIN-NEXT: vle16.v v10, (a1) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v11 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vse16.v v8, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: flh fa4, 0(sp) +; ZVFHMIN-NEXT: flh fa3, 4(sp) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-NEXT: lui a3, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-NEXT: xor a4, a4, a3 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v11, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-NEXT: vslidedown.vi v11, v8, 4, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmacc.vv v10, v8, v9 +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -1583,53 +3694,125 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) { ; ; ZVFHMIN-RV32-LABEL: fmsub_v6f16: ; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a2) ; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) ; ZVFHMIN-RV32-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v8, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV32-NEXT: lui a3, 1048568 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV32-NEXT: lui a5, 8 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: xor a4, a4, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV32-NEXT: vmv.v.x v11, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v11, v8, 4, v0.t +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmacc.vv v10, v8, v9 +; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v8 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 ; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 ; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fmsub_v6f16: ; ZVFHMIN-RV64: 
# %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a2) ; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) ; ZVFHMIN-RV64-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v8, v11 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV64-NEXT: lui a3, 1048568 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV64-NEXT: lui a5, 8 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV64-NEXT: xor a4, a4, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v11, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v11, v8, 4, v0.t +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmacc.vv v10, v8, v9 +; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v8 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) ; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y @@ -2018,17 +4201,187 @@ define void @fneg_v16f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: fneg_v16f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-NEXT: vse16.v v10, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-RV32-LABEL: fneg_v16f16: +; ZVFHMIN-RV32: # %bb.0: +; ZVFHMIN-RV32-NEXT: addi sp, sp, -64 +; ZVFHMIN-RV32-NEXT: 
.cfi_def_cfa_offset 64 +; ZVFHMIN-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; ZVFHMIN-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; ZVFHMIN-RV32-NEXT: .cfi_offset ra, -4 +; ZVFHMIN-RV32-NEXT: .cfi_offset s0, -8 +; ZVFHMIN-RV32-NEXT: addi s0, sp, 64 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN-RV32-NEXT: andi sp, sp, -32 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 16, e16, m1, ta, mu +; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-RV32-NEXT: flh fa3, 6(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-RV32-NEXT: flh fa4, 8(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-RV32-NEXT: lui a1, 1048568 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: lui t0, 8 +; ZVFHMIN-RV32-NEXT: xor a3, a3, t0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV32-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-RV32-NEXT: xor a5, a5, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV32-NEXT: xor a4, a6, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 16(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: xor a4, a7, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV32-NEXT: xor a2, a4, t0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-RV32-NEXT: xor a5, a5, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 26(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-RV32-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: xor a1, a2, a1 +; ZVFHMIN-RV32-NEXT: li a2, 255 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 8, v0.t +; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, s0, -64 +; ZVFHMIN-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; ZVFHMIN-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; ZVFHMIN-RV32-NEXT: addi sp, sp, 64 +; ZVFHMIN-RV32-NEXT: ret +; +; ZVFHMIN-RV64-LABEL: fneg_v16f16: +; 
ZVFHMIN-RV64: # %bb.0: +; ZVFHMIN-RV64-NEXT: addi sp, sp, -64 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 64 +; ZVFHMIN-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; ZVFHMIN-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; ZVFHMIN-RV64-NEXT: .cfi_offset ra, -8 +; ZVFHMIN-RV64-NEXT: .cfi_offset s0, -16 +; ZVFHMIN-RV64-NEXT: addi s0, sp, 64 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN-RV64-NEXT: andi sp, sp, -32 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 16, e16, m1, ta, mu +; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-RV64-NEXT: flh fa3, 6(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-RV64-NEXT: flh fa4, 8(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV64-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-RV64-NEXT: lui a1, 1048568 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: lui t0, 8 +; ZVFHMIN-RV64-NEXT: xor a3, a3, t0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-RV64-NEXT: xor a5, a5, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV64-NEXT: xor a4, a6, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 16(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: xor a4, a7, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 20(sp) +; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: xor a2, a4, t0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-RV64-NEXT: xor a5, a5, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 26(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-RV64-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: xor a1, a2, a1 +; ZVFHMIN-RV64-NEXT: li a2, 255 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 8, v0.t +; ZVFHMIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, s0, -64 +; ZVFHMIN-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; ZVFHMIN-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; ZVFHMIN-RV64-NEXT: addi 
sp, sp, 64 +; ZVFHMIN-RV64-NEXT: ret %a = load <16 x half>, ptr %x %b = fneg <16 x half> %a store <16 x half> %b, ptr %x @@ -3554,24 +5907,60 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-LABEL: fmsub_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vse16.v v9, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: flh fa4, 0(sp) ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 4(sp) +; ZVFHMIN-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: lui a1, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a3, a3, a5 +; ZVFHMIN-NEXT: vmv.v.x v10, a3 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a2, a2, a1 +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a2 +; ZVFHMIN-NEXT: xor a4, a4, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a4 +; ZVFHMIN-NEXT: xor a3, a3, a1 +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a3 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a2, a2, a1 +; ZVFHMIN-NEXT: xor a3, a3, a5 +; ZVFHMIN-NEXT: vmv.v.x v11, a3 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-NEXT: xor a3, a3, a1 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a3 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: xor a1, a2, a1 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-NEXT: vslidedown.vi v11, v10, 4, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v11 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmacc.vv v10, v8, v9 +; ZVFHMIN-NEXT: vfmadd.vv v8, v11, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 -; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -3595,63 +5984,135 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-RV32-LABEL: fmsub_vf_v6f16: ; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v9, (a1) +; 
ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: li a4, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a3, v0 +; ZVFHMIN-RV32-NEXT: lui a1, 1048568 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV32-NEXT: lui a5, 8 +; ZVFHMIN-RV32-NEXT: xor a4, a4, a5 +; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a2 +; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a3 +; ZVFHMIN-RV32-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: xor a3, a3, a5 +; ZVFHMIN-RV32-NEXT: vmv.v.x v11, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: xor a1, a2, a1 +; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v11, v10, 4, v0.t +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v9, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmacc.vv v8, v9, v10 +; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 ; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fmsub_vf_v6f16: ; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v9, (a1) +; 
ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: li a4, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a4 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a3, v0 +; ZVFHMIN-RV64-NEXT: lui a1, 1048568 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV64-NEXT: lui a5, 8 +; ZVFHMIN-RV64-NEXT: xor a4, a4, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a4 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a2 +; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a3 +; ZVFHMIN-RV64-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a4 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: xor a3, a3, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v11, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: xor a1, a2, a1 +; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v11, v10, 4, v0.t +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v9, v11 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmacc.vv v8, v9, v10 +; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) +; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll index 95a410ea56b74..4bf9ae16cdaf0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll @@ -19,12 +19,10 @@ define @vfabs_nxv1f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv1f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma 
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv1f16( %v) ret %r @@ -41,12 +39,10 @@ define @vfabs_nxv2f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv2f16( %v) ret %r @@ -63,12 +59,10 @@ define @vfabs_nxv4f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv4f16( %v) ret %r @@ -85,12 +79,10 @@ define @vfabs_nxv8f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfabs.v v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv8f16( %v) ret %r @@ -107,12 +99,10 @@ define @vfabs_nxv16f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv16f16( %v) ret %r @@ -129,17 +119,10 @@ define @vfabs_nxv32f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv32f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv32f16( %v) ret %r diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll index 029a121d08980..c71c07488581a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll @@ -19,13 +19,12 @@ define @vfcopysign_vv_nxv1f16( %vm, 
@llvm.copysign.nxv1f16( %vm, %vs) ret %r @@ -45,12 +44,11 @@ define @vfcopysign_vf_nxv1f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v10, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -67,18 +65,13 @@ define @vfcopynsign_vv_nxv1f16( %vm, %vs %r = call @llvm.copysign.nxv1f16( %vm, %n) @@ -99,17 +92,12 @@ define @vfcopynsign_vf_nxv1f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v10, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -130,12 +118,11 @@ define @vfcopysign_exttrunc_vv_nxv1f16_nxv1f32( %vs to %r = call @llvm.copysign.nxv1f16( %vm, %e) @@ -158,12 +145,11 @@ define @vfcopysign_exttrunc_vf_nxv1f16_nxv1f32( poison, float %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -182,19 +168,14 @@ define @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f32( %vs %eneg = fptrunc %n to @@ -216,19 +197,14 @@ define @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f32( poison, float %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -254,12 +230,11 @@ define @vfcopysign_exttrunc_vv_nxv1f16_nxv1f64( %vs to %r = call @llvm.copysign.nxv1f16( %vm, %e) @@ -286,12 +261,11 @@ define @vfcopysign_exttrunc_vf_nxv1f16_nxv1f64( poison, double %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -312,22 +286,17 @@ define @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f64( %vs %eneg = fptrunc %n to @@ -351,22 +320,17 @@ define @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f64( poison, double %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -387,13 +351,12 @@ define @vfcopysign_vv_nxv2f16( %vm, @llvm.copysign.nxv2f16( %vm, %vs) ret %r @@ -413,12 +376,11 @@ define @vfcopysign_vf_nxv2f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, 
v10, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -435,18 +397,13 @@ define @vfcopynsign_vv_nxv2f16( %vm, %vs %r = call @llvm.copysign.nxv2f16( %vm, %n) @@ -467,17 +424,12 @@ define @vfcopynsign_vf_nxv2f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v10, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -497,13 +449,12 @@ define @vfcopysign_vv_nxv4f16( %vm, @llvm.copysign.nxv4f16( %vm, %vs) ret %r @@ -523,12 +474,11 @@ define @vfcopysign_vf_nxv4f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -545,18 +495,13 @@ define @vfcopynsign_vv_nxv4f16( %vm, %vs %r = call @llvm.copysign.nxv4f16( %vm, %n) @@ -577,17 +522,12 @@ define @vfcopynsign_vf_nxv4f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -607,13 +547,12 @@ define @vfcopysign_vv_nxv8f16( %vm, @llvm.copysign.nxv8f16( %vm, %vs) ret %r @@ -633,12 +572,11 @@ define @vfcopysign_vf_nxv8f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: 
vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v10, v10, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -655,18 +593,13 @@ define @vfcopynsign_vv_nxv8f16( %vm, %vs %r = call @llvm.copysign.nxv8f16( %vm, %n) @@ -687,17 +620,12 @@ define @vfcopynsign_vf_nxv8f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfneg.v v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v10, v10, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -718,12 +646,11 @@ define @vfcopysign_exttrunc_vv_nxv8f16_nxv8f32( %vs to %r = call @llvm.copysign.nxv8f16( %vm, %e) @@ -746,12 +673,11 @@ define @vfcopysign_exttrunc_vf_nxv8f16_nxv8f32( poison, float %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -770,19 +696,14 @@ define @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f32( %vs %eneg = fptrunc %n to @@ -804,19 +725,14 @@ define @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f32( poison, float %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -842,12 +758,11 @@ define @vfcopysign_exttrunc_vv_nxv8f16_nxv8f64( %vs to %r = call @llvm.copysign.nxv8f16( %vm, %e) @@ -874,12 +789,11 @@ define @vfcopysign_exttrunc_vf_nxv8f16_nxv8f64( poison, double %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -900,22 +814,17 @@ define @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f64( %vs %eneg = fptrunc %n to @@ -939,22 +848,17 @@ define @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f64( poison, double %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -975,13 +879,12 @@ define @vfcopysign_vv_nxv16f16( %vm, @llvm.copysign.nxv16f16( %vm, %vs) ret %r @@ -1001,12 +904,11 @@ define @vfcopysign_vf_nxv16f16( %vm, ha ; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v12, v12, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1023,18 +925,13 @@ define @vfcopynsign_vv_nxv16f16( %vm, < ; ; ZVFHMIN-LABEL: vfcopynsign_vv_nxv16f16: ; ZVFHMIN: # %bb.0: 
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v12, v12, a0 +; ZVFHMIN-NEXT: vand.vx v12, v12, a0 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret %n = fneg %vs %r = call @llvm.copysign.nxv16f16( %vm, %n) @@ -1055,17 +952,12 @@ define @vfcopynsign_vf_nxv16f16( %vm, h ; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v12, v12, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v12, v12, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1085,19 +977,12 @@ define @vfcopysign_vv_nxv32f16( %vm, @llvm.copysign.nxv32f16( %vm, %vs) ret %r @@ -1117,17 +1002,13 @@ define @vfcopysign_vf_nxv32f16( %vm, ha ; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vmv.v.v v28, v24 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v16, v24, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1144,29 +1025,13 @@ define @vfcopynsign_vv_nxv32f16( %vm, < ; ; ZVFHMIN-LABEL: vfcopynsign_vv_nxv32f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v24, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v24, v24 -; ZVFHMIN-NEXT: 
vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v24, v24, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v20 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v16, v16, a0 +; ZVFHMIN-NEXT: vand.vx v16, v16, a0 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret %n = fneg %vs %r = call @llvm.copysign.nxv32f16( %vm, %n) @@ -1187,22 +1052,14 @@ define @vfcopynsign_vf_nxv32f16( %vm, h ; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vmv.v.v v28, v24 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v16, v24, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v16, v16, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll index c835dc72268b3..725ac14b0e7a7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll @@ -22,19 +22,16 @@ define @vfmsub_vv_nxv1f16( %va, %vc %vd = call @llvm.experimental.constrained.fma.nxv1f16( %va, %vb, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -55,18 +52,15 @@ define @vfmsub_vf_nxv1f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -86,19 +80,16 @@ define @vfmsub_vv_nxv2f16( %va, %vb %vd = call @llvm.experimental.constrained.fma.nxv2f16( %va, %vc, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -119,18 +110,15 @@ define @vfmsub_vf_nxv2f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -150,19 +138,16 @@ define @vfmsub_vv_nxv4f16( %va, %vc %vd = call @llvm.experimental.constrained.fma.nxv4f16( %vb, %va, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -183,16 +168,13 @@ define @vfmsub_vf_nxv4f16( %va, @vfmsub_vv_nxv8f16( %va, %va %vd = call @llvm.experimental.constrained.fma.nxv8f16( %vb, %vc, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") 
@@ -247,16 +226,13 @@ define @vfmsub_vf_nxv8f16( %va, @vfmsub_vv_nxv16f16( %va, %vb %vd = call @llvm.experimental.constrained.fma.nxv16f16( %vc, %va, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -312,16 +299,13 @@ define @vfmsub_vf_nxv16f16( %va, @vfmsub_vv_nxv32f16( %va, @vfmsub_vf_nxv32f16( %va, @vfneg_vv_nxv1f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv1f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb @@ -37,12 +34,9 @@ define @vfneg_vv_nxv2f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb @@ -57,12 +51,9 @@ define @vfneg_vv_nxv4f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb @@ -77,12 +68,9 @@ define @vfneg_vv_nxv8f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfneg.v v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb @@ -97,12 +85,9 @@ define @vfneg_vv_nxv16f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb @@ -117,17 +102,9 @@ define @vfneg_vv_nxv32f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv32f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, 
v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll index b54590cd9d844..2f41b59d6b225 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll @@ -22,24 +22,17 @@ define @vfnmsub_vv_nxv1f16( %va, %va %neg2 = fneg %vc @@ -61,23 +54,16 @@ define @vfnmsub_vf_nxv1f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -98,24 +84,17 @@ define @vfnmsub_vv_nxv2f16( %va, %va %neg2 = fneg %vb @@ -137,23 +116,16 @@ define @vfnmsub_vf_nxv2f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -174,24 +146,17 @@ define @vfnmsub_vv_nxv4f16( %va, %vb %neg2 = fneg %vc @@ -213,23 +178,16 @@ define @vfnmsub_vf_nxv4f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -250,24 +208,17 @@ define @vfnmsub_vv_nxv8f16( %va, %vb %neg2 = fneg %va @@ -289,23 +240,16 @@ define @vfnmsub_vf_nxv8f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -326,25 +270,17 @@ define @vfnmsub_vv_nxv16f16( %va, %vc %neg2 = fneg %vb @@ -361,29 +297,21 @@ define @vfnmsub_vf_nxv16f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -408,92 +336,79 @@ define @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vf_nxv32f16( %va, @vfnmsub_vv_nxv1f16( %va, %va %vd = call @llvm.experimental.constrained.fma.nxv1f16( %neg, %vb, %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -55,18 +52,15 @@ define @vfnmsub_vf_nxv1f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -86,19 +80,16 @@ define @vfnmsub_vv_nxv2f16( %va, %va %vd = call @llvm.experimental.constrained.fma.nxv2f16( %neg, %vc, %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -119,18 +110,15 @@ define @vfnmsub_vf_nxv2f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -150,19 +138,16 @@ define @vfnmsub_vv_nxv4f16( %va, %vb %vd = call @llvm.experimental.constrained.fma.nxv4f16( %neg, %va, %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -183,18 +168,15 @@ define @vfnmsub_vf_nxv4f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -214,19 +196,16 @@ define @vfnmsub_vv_nxv8f16( %va, %vb %vd = call @llvm.experimental.constrained.fma.nxv8f16( %neg, %vc, %va, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -247,18 +226,15 @@ define @vfnmsub_vf_nxv8f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -278,34 +254,16 @@ define @vfnmsub_vv_nxv16f16( %va, %vc %vd = call @llvm.experimental.constrained.fma.nxv16f16( %neg, %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -321,38 +279,20 @@ define @vfnmsub_vf_nxv16f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -376,77 +316,79 @@ define @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vf_nxv32f16( %va, Date: Tue, 3 Sep 2024 22:49:02 -0700 Subject: [PATCH 032/425] [RISCV][GISel] Use CCValAssign::getCustomReg for converting f16/f32<->GPR. (#105700) This gives us much better control of the generated code for GISel. 
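As a quick illustration (a condensed sketch only, taken from the convertLocVTToValVT hunk further down in this patch; the Full/Indirect/BCvt cases are elided), the SelectionDAG side now keys the f16/bf16/f32-in-GPR conversion off needsCustom() rather than a BCvt special case:

    // Sketch condensed from the RISCVISelLowering.cpp change below.
    static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
                                       const CCValAssign &VA, const SDLoc &DL,
                                       const RISCVSubtarget &Subtarget) {
      if (VA.needsCustom()) {
        // f16/bf16 passed in an integer register: move the bits back to an FPR.
        if (VA.getLocVT().isInteger() &&
            (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
          return DAG.getNode(RISCVISD::FMV_H_X, DL, VA.getValVT(), Val);
        // f32 passed in a 64-bit GPR on RV64.
        if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32)
          return DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val);
        llvm_unreachable("Unexpected Custom handling.");
      }
      // ... Full/Indirect/BCvt handling unchanged ...
      return Val;
    }

The GISel handlers follow the same shape, emitting G_ANYEXT/G_TRUNC around the physreg copy in assignCustomValue.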
I've tried to closely match the current gisel code, but it looks like we had 2 layers of G_ANYEXT in some cases before. SelectionDAG now checks needsCustom() instead of detecting the special cases in the Bitcast handler. Unfortunately, IRTranslator for bitcast still generates copies between register classes of different sizes. Because of this we can't handle i16<->f16 bitcasts without crashing. Not sure if I should teach RISCVInstrInfo::copyPhysReg to allow copies between FPR16 and GPR or if I should convert the copies to instructions in GISel. --- .../Target/RISCV/GISel/RISCVCallLowering.cpp | 61 +++++++++++------ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 68 +++++++++++++------ .../irtranslator/calling-conv-half.ll | 9 ++- 3 files changed, 92 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index c3cb1be963cab..6e33032384ede 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -109,15 +109,6 @@ struct RISCVOutgoingValueHandler : public CallLowering::OutgoingValueHandler { void assignValueToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA) override { - // If we're passing a smaller fp value into a larger integer register, - // anyextend before copying. - if ((VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) || - ((VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::i64) && - VA.getValVT() == MVT::f16)) { - LLT DstTy = LLT::scalar(VA.getLocVT().getSizeInBits()); - ValVReg = MIRBuilder.buildAnyExt(DstTy, ValVReg).getReg(0); - } - Register ExtReg = extendRegister(ValVReg, VA); MIRBuilder.buildCopy(PhysReg, ExtReg); MIB.addUse(PhysReg, RegState::Implicit); @@ -126,16 +117,35 @@ struct RISCVOutgoingValueHandler : public CallLowering::OutgoingValueHandler { unsigned assignCustomValue(CallLowering::ArgInfo &Arg, ArrayRef VAs, std::function *Thunk) override { + const CCValAssign &VA = VAs[0]; + if ((VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) || + (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16)) { + Register PhysReg = VA.getLocReg(); + + auto assignFunc = [=]() { + auto Trunc = MIRBuilder.buildAnyExt(LLT(VA.getLocVT()), Arg.Regs[0]); + MIRBuilder.buildCopy(PhysReg, Trunc); + MIB.addUse(PhysReg, RegState::Implicit); + }; + + if (Thunk) { + *Thunk = assignFunc; + return 1; + } + + assignFunc(); + return 1; + } + assert(VAs.size() >= 2 && "Expected at least 2 VAs."); - const CCValAssign &VALo = VAs[0]; const CCValAssign &VAHi = VAs[1]; assert(VAHi.needsCustom() && "Value doesn't need custom handling"); - assert(VALo.getValNo() == VAHi.getValNo() && + assert(VA.getValNo() == VAHi.getValNo() && "Values belong to different arguments"); - assert(VALo.getLocVT() == MVT::i32 && VAHi.getLocVT() == MVT::i32 && - VALo.getValVT() == MVT::f64 && VAHi.getValVT() == MVT::f64 && + assert(VA.getLocVT() == MVT::i32 && VAHi.getLocVT() == MVT::i32 && + VA.getValVT() == MVT::f64 && VAHi.getValVT() == MVT::f64 && "unexpected custom value"); Register NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)), @@ -154,7 +164,7 @@ struct RISCVOutgoingValueHandler : public CallLowering::OutgoingValueHandler { } auto assignFunc = [=]() { - assignValueToReg(NewRegs[0], VALo.getLocReg(), VALo); + assignValueToReg(NewRegs[0], VA.getLocReg(), VA); if (VAHi.isRegLoc()) assignValueToReg(NewRegs[1], VAHi.getLocReg(), VAHi); }; @@ -258,16 +268,29 @@ struct RISCVIncomingValueHandler : public 
CallLowering::IncomingValueHandler { unsigned assignCustomValue(CallLowering::ArgInfo &Arg, ArrayRef VAs, std::function *Thunk) override { + const CCValAssign &VA = VAs[0]; + if ((VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) || + (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16)) { + Register PhysReg = VA.getLocReg(); + + markPhysRegUsed(PhysReg); + + LLT LocTy(VA.getLocVT()); + auto Copy = MIRBuilder.buildCopy(LocTy, PhysReg); + + MIRBuilder.buildTrunc(Arg.Regs[0], Copy.getReg(0)); + return 1; + } + assert(VAs.size() >= 2 && "Expected at least 2 VAs."); - const CCValAssign &VALo = VAs[0]; const CCValAssign &VAHi = VAs[1]; assert(VAHi.needsCustom() && "Value doesn't need custom handling"); - assert(VALo.getValNo() == VAHi.getValNo() && + assert(VA.getValNo() == VAHi.getValNo() && "Values belong to different arguments"); - assert(VALo.getLocVT() == MVT::i32 && VAHi.getLocVT() == MVT::i32 && - VALo.getValVT() == MVT::f64 && VAHi.getValVT() == MVT::f64 && + assert(VA.getLocVT() == MVT::i32 && VAHi.getLocVT() == MVT::i32 && + VA.getValVT() == MVT::f64 && VAHi.getValVT() == MVT::f64 && "unexpected custom value"); Register NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)), @@ -284,7 +307,7 @@ struct RISCVIncomingValueHandler : public CallLowering::IncomingValueHandler { const_cast(VAHi)); } - assignValueToReg(NewRegs[0], VALo.getLocReg(), VALo); + assignValueToReg(NewRegs[0], VA.getLocReg(), VA); if (VAHi.isRegLoc()) assignValueToReg(NewRegs[1], VAHi.getLocReg(), VAHi); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5089bbbe3c0d7..d46a08a442a01 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19226,6 +19226,19 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, // similar local variables rather than directly checking against the target // ABI. + ArrayRef ArgGPRs = RISCV::getArgGPRs(ABI); + + if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::bf16 || + (ValVT == MVT::f32 && XLen == 64))) { + Register Reg = State.AllocateReg(ArgGPRs); + if (Reg) { + LocVT = XLenVT; + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::bf16 || ValVT == MVT::f32)) { LocVT = XLenVT; @@ -19235,8 +19248,6 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, LocInfo = CCValAssign::BCvt; } - ArrayRef ArgGPRs = RISCV::getArgGPRs(ABI); - // If this is a variadic argument, the RISC-V calling convention requires // that it is assigned an 'even' or 'aligned' register if it has 8-byte // alignment (RV32) or 16-byte alignment (RV64). 
An aligned register should @@ -19483,6 +19494,17 @@ void RISCVTargetLowering::analyzeOutputArgs( static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val, const CCValAssign &VA, const SDLoc &DL, const RISCVSubtarget &Subtarget) { + if (VA.needsCustom()) { + if (VA.getLocVT().isInteger() && + (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) + Val = DAG.getNode(RISCVISD::FMV_H_X, DL, VA.getValVT(), Val); + else if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) + Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val); + else + llvm_unreachable("Unexpected Custom handling."); + return Val; + } + switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); @@ -19491,14 +19513,7 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val, Val = convertFromScalableVector(VA.getValVT(), Val, DAG, Subtarget); break; case CCValAssign::BCvt: - if (VA.getLocVT().isInteger() && - (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { - Val = DAG.getNode(RISCVISD::FMV_H_X, DL, VA.getValVT(), Val); - } else if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) { - Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val); - } else { - Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); - } + Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; } return Val; @@ -19544,6 +19559,17 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, const RISCVSubtarget &Subtarget) { EVT LocVT = VA.getLocVT(); + if (VA.needsCustom()) { + if (LocVT.isInteger() && + (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) + Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, LocVT, Val); + else if (LocVT == MVT::i64 && VA.getValVT() == MVT::f32) + Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val); + else + llvm_unreachable("Unexpected Custom handling."); + return Val; + } + switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); @@ -19552,14 +19578,7 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, Val = convertToScalableVector(LocVT, Val, DAG, Subtarget); break; case CCValAssign::BCvt: - if (LocVT.isInteger() && - (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { - Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, LocVT, Val); - } else if (LocVT == MVT::i64 && VA.getValVT() == MVT::f32) { - Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val); - } else { - Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val); - } + Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val); break; } return Val; @@ -19693,8 +19712,14 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, (LocVT == MVT::f64 && Subtarget.is64Bit() && Subtarget.hasStdExtZdinx())) { if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { - LocInfo = CCValAssign::BCvt; + if (LocVT.getSizeInBits() != Subtarget.getXLen()) { + LocVT = Subtarget.getXLenVT(); + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } LocVT = Subtarget.getXLenVT(); + LocInfo = CCValAssign::BCvt; State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -20337,9 +20362,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, Glue = RetValue2.getValue(2); RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, RetValue, RetValue2); - } - - RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL, Subtarget); + } else + RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL, Subtarget); 
InVals.push_back(RetValue); } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll index 04fa62b195076..63bc43ae20e7b 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll @@ -1018,7 +1018,6 @@ define half @caller_half_return_stack2(half %x, half %y) nounwind { ; RV64IF-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16) ; RV64IF-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16) ; RV64IF-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16) - ; RV64IF-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT]](s32) ; RV64IF-NEXT: $f11_f = COPY [[ANYEXT1]](s32) ; RV64IF-NEXT: $f12_f = COPY [[ANYEXT2]](s32) @@ -1027,14 +1026,14 @@ define half @caller_half_return_stack2(half %x, half %y) nounwind { ; RV64IF-NEXT: $f15_f = COPY [[ANYEXT5]](s32) ; RV64IF-NEXT: $f16_f = COPY [[ANYEXT6]](s32) ; RV64IF-NEXT: $f17_f = COPY [[ANYEXT7]](s32) - ; RV64IF-NEXT: [[ANYEXT9:%[0-9]+]]:_(s64) = G_ANYEXT [[ANYEXT8]](s32) - ; RV64IF-NEXT: $x10 = COPY [[ANYEXT9]](s64) + ; RV64IF-NEXT: [[ANYEXT8:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s16) + ; RV64IF-NEXT: $x10 = COPY [[ANYEXT8]](s64) ; RV64IF-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack2, csr_ilp32f_lp64f, implicit-def $x1, implicit $f10_f, implicit $f11_f, implicit $f12_f, implicit $f13_f, implicit $f14_f, implicit $f15_f, implicit $f16_f, implicit $f17_f, implicit $x10, implicit-def $f10_f ; RV64IF-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 ; RV64IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $f10_f ; RV64IF-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; RV64IF-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16) - ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT10]](s32) + ; RV64IF-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16) + ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT9]](s32) ; RV64IF-NEXT: PseudoRET implicit $f10_f ; ; RV64IZFH-LABEL: name: caller_half_return_stack2 From 4a44898be5d46694b59aa411f2b45a52f2ce8411 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 4 Sep 2024 13:50:56 +0800 Subject: [PATCH 033/425] [RISCV] Add passthru to vmv.v.v intrinsic tests. 
NFC This prevents them from being optimized away in an upcoming peephole --- llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll | 430 ++++++++++++------------- 1 file changed, 215 insertions(+), 215 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll index 7217c2cfafca2..784b807a6a2e5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll @@ -9,17 +9,17 @@ declare @llvm.riscv.vmv.v.v.nxv1i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i8_nxv1i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -29,17 +29,17 @@ declare @llvm.riscv.vmv.v.v.nxv2i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i8_nxv2i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -49,17 +49,17 @@ declare @llvm.riscv.vmv.v.v.nxv4i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i8_nxv4i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -69,17 +69,17 @@ declare @llvm.riscv.vmv.v.v.nxv8i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i8_nxv8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -89,17 +89,17 @@ declare @llvm.riscv.vmv.v.v.nxv16i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i8_nxv16i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv16i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -109,17 +109,17 @@ declare @llvm.riscv.vmv.v.v.nxv32i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i8_nxv32i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, 
ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv32i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -129,17 +129,17 @@ declare @llvm.riscv.vmv.v.v.nxv64i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv64i8_nxv64i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv64i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -149,17 +149,17 @@ declare @llvm.riscv.vmv.v.v.nxv1i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i16_nxv1i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -169,17 +169,17 @@ declare @llvm.riscv.vmv.v.v.nxv2i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i16_nxv2i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -189,17 +189,17 @@ declare @llvm.riscv.vmv.v.v.nxv4i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i16_nxv4i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -209,17 +209,17 @@ declare @llvm.riscv.vmv.v.v.nxv8i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i16_nxv8i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -229,17 +229,17 @@ declare @llvm.riscv.vmv.v.v.nxv16i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i16_nxv16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret 
entry: %a = call @llvm.riscv.vmv.v.v.nxv16i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -249,17 +249,17 @@ declare @llvm.riscv.vmv.v.v.nxv32i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i16_nxv32i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv32i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -269,17 +269,17 @@ declare @llvm.riscv.vmv.v.v.nxv1i32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i32_nxv1i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1i32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -289,17 +289,17 @@ declare @llvm.riscv.vmv.v.v.nxv2i32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i32_nxv2i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2i32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -309,17 +309,17 @@ declare @llvm.riscv.vmv.v.v.nxv4i32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i32_nxv4i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4i32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -329,17 +329,17 @@ declare @llvm.riscv.vmv.v.v.nxv8i32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i32_nxv8i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8i32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -349,17 +349,17 @@ declare @llvm.riscv.vmv.v.v.nxv16i32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i32_nxv16i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv16i32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -369,17 +369,17 @@ 
declare @llvm.riscv.vmv.v.v.nxv1i64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i64_nxv1i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1i64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -389,17 +389,17 @@ declare @llvm.riscv.vmv.v.v.nxv2i64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i64_nxv2i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2i64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -409,17 +409,17 @@ declare @llvm.riscv.vmv.v.v.nxv4i64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i64_nxv4i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4i64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -429,17 +429,17 @@ declare @llvm.riscv.vmv.v.v.nxv8i64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i64_nxv8i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8i64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -449,17 +449,17 @@ declare @llvm.riscv.vmv.v.v.nxv1f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f16_nxv1f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -469,17 +469,17 @@ declare @llvm.riscv.vmv.v.v.nxv2f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f16_nxv2f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -489,17 +489,17 @@ declare @llvm.riscv.vmv.v.v.nxv4f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4f16_nxv4f16( %0, iXLen %1) nounwind { +define 
@intrinsic_vmv.v.v_v_nxv4f16_nxv4f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f16_nxv4f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -509,17 +509,17 @@ declare @llvm.riscv.vmv.v.v.nxv8f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f16_nxv8f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -529,17 +529,17 @@ declare @llvm.riscv.vmv.v.v.nxv16f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f16_nxv16f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv16f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -549,17 +549,17 @@ declare @llvm.riscv.vmv.v.v.nxv32f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32f16_nxv32f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv32f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -569,17 +569,17 @@ declare @llvm.riscv.vmv.v.v.nxv1bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1bf16_nxv1bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1bf16_nxv1bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1bf16_nxv1bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -589,17 +589,17 @@ declare @llvm.riscv.vmv.v.v.nxv2bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2bf16_nxv2bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2bf16_nxv2bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2bf16_nxv2bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -609,17 +609,17 @@ declare @llvm.riscv.vmv.v.v.nxv4bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4bf16_nxv4bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4bf16_nxv4bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: 
intrinsic_vmv.v.v_v_nxv4bf16_nxv4bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -629,17 +629,17 @@ declare @llvm.riscv.vmv.v.v.nxv8bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8bf16_nxv8bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8bf16_nxv8bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8bf16_nxv8bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -649,17 +649,17 @@ declare @llvm.riscv.vmv.v.v.nxv16bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16bf16_nxv16bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16bf16_nxv16bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16bf16_nxv16bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv16bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -669,17 +669,17 @@ declare @llvm.riscv.vmv.v.v.nxv32bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv32bf16_nxv32bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv32bf16_nxv32bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32bf16_nxv32bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv32bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -689,17 +689,17 @@ declare @llvm.riscv.vmv.v.v.nxv1f32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f32_nxv1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1f32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -709,17 +709,17 @@ declare @llvm.riscv.vmv.v.v.nxv2f32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f32_nxv2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2f32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -729,17 +729,17 @@ declare @llvm.riscv.vmv.v.v.nxv4f32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f32_nxv4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, 
e32, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4f32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -749,17 +749,17 @@ declare @llvm.riscv.vmv.v.v.nxv8f32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f32_nxv8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8f32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -769,17 +769,17 @@ declare @llvm.riscv.vmv.v.v.nxv16f32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f32_nxv16f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv16f32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -789,17 +789,17 @@ declare @llvm.riscv.vmv.v.v.nxv1f64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f64_nxv1f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1f64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -809,17 +809,17 @@ declare @llvm.riscv.vmv.v.v.nxv2f64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f64_nxv2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2f64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -829,17 +829,17 @@ declare @llvm.riscv.vmv.v.v.nxv4f64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f64_nxv4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4f64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -849,17 +849,17 @@ declare @llvm.riscv.vmv.v.v.nxv8f64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f64_nxv8f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; 
CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8f64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } From 3449ed8dece600f387357b71ff74ae4bc46828b6 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 3 Sep 2024 22:47:34 -0700 Subject: [PATCH 034/425] Revert "[clang-format] Correctly annotate braces in macro definition (#106662)" This reverts commit 0fa78b6c7bd43c2498700a98c47a02cf4fd06388 due to regression. Fixes #107096. --- clang/lib/Format/UnwrappedLineParser.cpp | 3 ++- clang/unittests/Format/TokenAnnotatorTest.cpp | 5 ----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 0d42a6c2bfb5c..246b29d308bfa 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -609,8 +609,9 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { ProbablyBracedList = NextTok->isNot(tok::l_square); } - // Cpp macro definition body containing nonempty braced list or block: + // Cpp macro definition body that is a nonempty braced list or block: if (IsCpp && Line->InMacroBody && PrevTok != FormatTok && + !FormatTok->Previous && NextTok->is(tok::eof) && // A statement can end with only `;` (simple statement), a block // closing brace (compound statement), or `:` (label statement). // If PrevTok is a block opening brace, Tok ends an empty block. diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index a2986f589396b..c0436d8a2e180 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3278,11 +3278,6 @@ TEST_F(TokenAnnotatorTest, BraceKind) { EXPECT_BRACE_KIND(Tokens[10], BK_Block); EXPECT_TOKEN(Tokens[11], tok::r_brace, TT_StructRBrace); EXPECT_BRACE_KIND(Tokens[11], BK_Block); - - Tokens = annotate("#define MEMBER(NAME) NAME{\"\"}"); - ASSERT_EQ(Tokens.size(), 11u) << Tokens; - EXPECT_BRACE_KIND(Tokens[7], BK_BracedInit); - EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit); } TEST_F(TokenAnnotatorTest, UnderstandsElaboratedTypeSpecifier) { From 7deda4ed0c712fb830d25f4e3090ff04f7adbcf9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 3 Sep 2024 23:13:52 -0700 Subject: [PATCH 035/425] [RISCV] Use MCRegister for variables returned from AllocateReg. NFC Avoids a cast from Register to MCRegister for the CCValAssign functions. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index d46a08a442a01..8f95a86ade303 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19087,7 +19087,7 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, State.getMachineFunction().getSubtarget(); ArrayRef ArgGPRs = RISCV::getArgGPRs(STI.getTargetABI()); - if (Register Reg = State.AllocateReg(ArgGPRs)) { + if (MCRegister Reg = State.AllocateReg(ArgGPRs)) { // At least one half can be passed via register. State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg, VA1.getLocVT(), CCValAssign::Full)); @@ -19108,7 +19108,7 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, return false; } - if (Register Reg = State.AllocateReg(ArgGPRs)) { + if (MCRegister Reg = State.AllocateReg(ArgGPRs)) { // The second half can also be passed via register. 
State.addLoc( CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full)); @@ -19230,7 +19230,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::bf16 || (ValVT == MVT::f32 && XLen == 64))) { - Register Reg = State.AllocateReg(ArgGPRs); + MCRegister Reg = State.AllocateReg(ArgGPRs); if (Reg) { LocVT = XLenVT; State.addLoc( @@ -19283,7 +19283,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, // GPRs, split between a GPR and the stack, or passed completely on the // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these // cases. - Register Reg = State.AllocateReg(ArgGPRs); + MCRegister Reg = State.AllocateReg(ArgGPRs); if (!Reg) { unsigned StackOffset = State.AllocateStack(8, Align(8)); State.addLoc( @@ -19292,7 +19292,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, } LocVT = MVT::i32; State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - Register HiReg = State.AllocateReg(ArgGPRs); + MCRegister HiReg = State.AllocateReg(ArgGPRs); if (HiReg) { State.addLoc( CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo)); @@ -19340,7 +19340,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, } // Allocate to a register if possible, or else a stack slot. - Register Reg; + MCRegister Reg; unsigned StoreSizeBytes = XLen / 8; Align StackAlign = Align(XLen / 8); From 06286832db0c4ee1899f9cee1b8f6234e45f16c7 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 3 Sep 2024 23:29:03 -0700 Subject: [PATCH 036/425] Reland "Revert "AtomicExpand: Allow incrementally legalizing atomicrmw"" (#106793) Reverts llvm/llvm-project#106792 The first commit of PR is pure revert, the rest is a possible fix. --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 35 +- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 373 +++++++++++--------- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 373 +++++++++++--------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 373 +++++++++++--------- llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 373 +++++++++++--------- 5 files changed, 836 insertions(+), 691 deletions(-) diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 39a705599f90c..2da723a0cc175 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -351,17 +351,30 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) { bool MadeChange = false; - SmallVector AtomicInsts; - - // Changing control-flow while iterating through it is a bad idea, so gather a - // list of all atomic instructions before we start. - for (Instruction &I : instructions(F)) - if (I.isAtomic() && !isa(&I)) - AtomicInsts.push_back(&I); - - for (auto *I : AtomicInsts) { - if (processAtomicInstr(I)) - MadeChange = true; + for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE;) { + BasicBlock *BB = &*BBI; + ++BBI; + + BasicBlock::iterator Next; + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; + I = Next) { + Instruction &Inst = *I; + Next = std::next(I); + + if (processAtomicInstr(&Inst)) { + MadeChange = true; + + // Detect control flow change and resume iteration from the original + // block to inspect any newly inserted blocks. This allows incremental + // legalization of atomicrmw and cmpxchg. 
+ if (Next == E || BB != Next->getParent()) { + BBI = BB->getIterator(); + BBE = F.end(); + break; + } + } + } } return MadeChange; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index 0d230bb9dcc6e..ed9c1b037d0cc 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -43,46 +43,49 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, 
half %value seq_cst, align 2 ret half %res @@ -128,46 +131,49 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -232,36 +238,40 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 @@ -327,36 +337,40 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 @@ -399,35 +413,38 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 @@ -469,36 +486,40 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl __adddf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 @@ -687,18 +708,18 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -711,29 +732,33 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -799,17 +824,18 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -819,25 +845,28 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -885,45 +914,49 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; 
SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] +; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 -; SOFTFP-NOLSE-NEXT: b .LBB9_1 -; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index bfe0d20ca814b..888b795876f7d 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -45,46 +45,49 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: 
// %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -130,46 +133,49 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; 
SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -234,36 +240,40 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 @@ -329,36 +339,40 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 @@ -401,35 +415,38 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 @@ -471,36 +488,40 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl fmax -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 @@ -567,18 +588,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -591,29 +612,33 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -723,17 +748,18 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -743,25 +769,28 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -809,45 +838,49 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 6b7d2df044460..a3665c6e42860 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -45,46 +45,49 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -130,46 +133,49 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 +; 
SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -234,36 +240,40 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 @@ -329,36 +339,40 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 @@ -401,35 +415,38 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 @@ -471,36 +488,40 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl fmin -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 @@ -567,18 +588,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -591,29 +612,33 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -723,17 +748,18 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -743,25 +769,28 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -809,45 +838,49 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 67e164037d5ce..7725ce0e73185 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -43,46 +43,49 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -128,46 +131,49 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 
+; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -232,36 +238,40 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 @@ -327,36 +337,40 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 @@ -399,35 +413,38 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 @@ -469,36 +486,40 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl __subdf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 @@ -687,18 +708,18 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -711,29 +732,33 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -799,17 +824,18 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -819,25 +845,28 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -885,45 +914,49 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; 
SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] +; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 -; SOFTFP-NOLSE-NEXT: b .LBB9_1 -; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload From 427e202a401514cb28bf2ca621baae8e1b2f552f Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Tue, 3 Sep 2024 23:54:36 -0700 Subject: [PATCH 037/425] [APInt] improve initialization performance (#106945) The purpose is to save an extra memset in both cases: 1. When `int64_t(val) < 0`, zeroing out is redundant as the subsequent for-loop will initialize to `val .. 0xFFFFF ....`. Instead we should only create an uninitialized buffer, and transform the slow for-loop into a memset to initialize the higher words to `0xFF`. 2. In the other case, first we create an uninitialized array (`new int64_t[]`) and _then_ we zero it out with `memset`. But this can be combined in one operation with `new int64_t[]()`, which default-initializes the array. On one example where use of APInt was heavy, this improved compile time by 1%. --- llvm/lib/Support/APInt.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index 78d573966c6c9..2348a4c9b795e 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -34,9 +34,7 @@ using namespace llvm; /// A utility function for allocating memory, checking for allocation failures, /// and ensuring the contents are zeroed. 
inline static uint64_t* getClearedMemory(unsigned numWords) { - uint64_t *result = new uint64_t[numWords]; - memset(result, 0, numWords * sizeof(uint64_t)); - return result; + return new uint64_t[numWords](); } /// A utility function for allocating memory and checking for allocation @@ -74,12 +72,15 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) { void APInt::initSlowCase(uint64_t val, bool isSigned) { - U.pVal = getClearedMemory(getNumWords()); - U.pVal[0] = val; - if (isSigned && int64_t(val) < 0) - for (unsigned i = 1; i < getNumWords(); ++i) - U.pVal[i] = WORDTYPE_MAX; - clearUnusedBits(); + if (isSigned && int64_t(val) < 0) { + U.pVal = getMemory(getNumWords()); + U.pVal[0] = val; + memset(&U.pVal[1], 0xFF, APINT_WORD_SIZE * (getNumWords() - 1)); + clearUnusedBits(); + } else { + U.pVal = getClearedMemory(getNumWords()); + U.pVal[0] = val; + } } void APInt::initSlowCase(const APInt& that) { From 4bccb01355edcfedacafede3e7878d74e2b0a28f Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 4 Sep 2024 10:02:55 +0200 Subject: [PATCH 038/425] [Clang] Workaround dependent source location issues (#106925) In #78436 we made some SourceLocExpr dependent to deal with the fact that their value should reflect the name of specialized function - rather than the rtemplate in which they are first used. However SourceLocExpr are unusual in two ways - They don't depend on template arguments - They morally depend on the context in which they are used (rather than called from). It's fair to say that this is quite novels and confuses clang. In particular, in some cases, we used to create dependent SourceLocExpr and never subsequently transform them, leaving dependent objects in instantiated functions types. To work around that we avoid replacing SourceLocExpr when we think they could remain dependent. It's certainly not perfect but it fixes a number of reported bugs, and seem to only affect scenarios in which the value of the SourceLocExpr does not matter (overload resolution). Fixes #106428 Fixes #81155 Fixes #80210 Fixes #85373 --------- Co-authored-by: Aaron Ballman --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/SemaExpr.cpp | 21 +++++++-- clang/test/SemaCXX/source_location.cpp | 60 ++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 4 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0eee71d00a2c5..45b08128600ef 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -342,6 +342,8 @@ Bug Fixes to C++ Support specialization right before its declaration context. (#GH64082) - Fixed a constraint comparison bug for friend declarations. (#GH78101) - Fix handling of ``_`` as the name of a lambda's init capture variable. (#GH107024) +- Fix an issue with dependent source location expressions (#GH106428), (#GH81155), (#GH80210), (#GH85373) + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 94bb938b53b44..e291ef6c97eef 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -5443,11 +5443,24 @@ struct EnsureImmediateInvocationInDefaultArgs // Rewrite to source location to refer to the context in which they are used. 
ExprResult TransformSourceLocExpr(SourceLocExpr *E) { - if (E->getParentContext() == SemaRef.CurContext) + DeclContext *DC = E->getParentContext(); + if (DC == SemaRef.CurContext) return E; - return getDerived().RebuildSourceLocExpr(E->getIdentKind(), E->getType(), - E->getBeginLoc(), E->getEndLoc(), - SemaRef.CurContext); + + // FIXME: During instantiation, because the rebuild of defaults arguments + // is not always done in the context of the template instantiator, + // we run the risk of producing a dependent source location + // that would never be rebuilt. + // This usually happens during overload resolution, or in contexts + // where the value of the source location does not matter. + // However, we should find a better way to deal with source location + // of function templates. + if (!SemaRef.CurrentInstantiationScope || + !SemaRef.CurContext->isDependentContext() || DC->isDependentContext()) + DC = SemaRef.CurContext; + + return getDerived().RebuildSourceLocExpr( + E->getIdentKind(), E->getType(), E->getBeginLoc(), E->getEndLoc(), DC); } }; diff --git a/clang/test/SemaCXX/source_location.cpp b/clang/test/SemaCXX/source_location.cpp index 6b3610d703e71..34177bfe287fc 100644 --- a/clang/test/SemaCXX/source_location.cpp +++ b/clang/test/SemaCXX/source_location.cpp @@ -929,3 +929,63 @@ void test() { } } + +namespace GH106428 { + +struct add_fn { + template + constexpr auto operator()(T lhs, T rhs, + const std::source_location loc = std::source_location::current()) + const -> T + { + return lhs + rhs; + } +}; + + +template +decltype(_Fp{}(0, 0)) +__invoke(_Fp&& __f); + +template +struct type_identity { using type = T; }; + +template +struct invoke_result : type_identity {}; + +using i = invoke_result::type; +static_assert(__is_same(i, int)); + +} + +#if __cplusplus >= 202002L + +namespace GH81155 { +struct buff { + buff(buff &, const char * = __builtin_FUNCTION()); +}; + +template +Ty declval(); + +template +auto Call(buff arg) -> decltype(Fx{}(arg)); + +template +struct F {}; + +template +struct InvocableR : F(declval()))> { + static constexpr bool value = false; +}; + +template ::value> +void Help(Fx) {} + +void Test() { + Help([](buff) {}); +} + +} + +#endif From de37da8e37c4c9042563e186068adca98bf59e07 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Wed, 4 Sep 2024 09:06:48 +0100 Subject: [PATCH 039/425] [MachineOutliner] Preserve instruction bundles (#106402) When the machine outliner copies instructions from a source function into an outlined function, it was doing it using `CloneMachineInstr`, which is documented as not preserving the interior of any instruction bundle. So outlining code that includes an instruction bundle would fail, because in the outlined version, the bundle would be empty, so instructions would go missing in the move. This occurs when any bundled instruction appears in the outlined code, so there was no need to construct an unusual test case: I've just copied a function from the existing `stp-opt-with-renaming.mir`, which happens to contain an SVE instruction bundle. Including two identical copies of that function makes the outliner merge them, and then we check that it didn't destroy the interior of the bundle in the process. 
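To make the difference concrete, here is a minimal sketch of the copy loop this change affects. It is for illustration only and is not the outliner's actual code: the helper name and the way the source instructions are passed in are invented, and it assumes the usual LLVM CodeGen headers.

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;

  // Copy candidate instructions into the (empty) outlined function's block.
  void copyIntoOutlinedBlock(MachineFunction &MF, MachineBasicBlock &MBB,
                             const TargetInstrInfo &TII,
                             ArrayRef<MachineInstr *> Candidate) {
    for (MachineInstr *MI : Candidate) {
      // Old approach: CloneMachineInstr copies only the BUNDLE header, so the
      // instructions inside the bundle are silently dropped:
      //   MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      //   MBB.insert(MBB.end(), NewMI);
      // New approach: ask the target to duplicate the instruction, which
      // (unlike the shallow clone) also carries over the bundle contents.
      MachineInstr &NewMI = TII.duplicate(MBB, MBB.end(), *MI);
      NewMI.dropMemRefs(MF); // same clean-up as before
    }
  }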
--- llvm/lib/CodeGen/MachineOutliner.cpp | 7 ++- .../AArch64/machine-outliner-bundle.mir | 54 +++++++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/machine-outliner-bundle.mir diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index 42f410c277179..97500d0abebad 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -763,10 +763,9 @@ MachineFunction *MachineOutliner::createOutlinedFunction( BuildMI(MBB, MBB.end(), DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(MF.addFrameInst(CFI)); } else { - MachineInstr *NewMI = MF.CloneMachineInstr(&MI); - NewMI->dropMemRefs(MF); - NewMI->setDebugLoc(DL); - MBB.insert(MBB.end(), NewMI); + MachineInstr &NewMI = TII.duplicate(MBB, MBB.end(), MI); + NewMI.dropMemRefs(MF); + NewMI.setDebugLoc(DL); } } diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-bundle.mir b/llvm/test/CodeGen/AArch64/machine-outliner-bundle.mir new file mode 100644 index 0000000000000..1dd5b0811bdfb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-bundle.mir @@ -0,0 +1,54 @@ +# RUN: llc -mtriple=aarch64 -run-pass=machine-outliner \ +# RUN: -verify-machineinstrs %s -o - | FileCheck %s + +# CHECK: name: OUTLINED_FUNCTION_0 +# CHECK-NOT: name: +# CHECK: BUNDLE implicit-def $z3, implicit-def $q3, implicit-def $d3, implicit-def $s3, implicit-def $h3, implicit-def $b3, implicit $z19, implicit $p0, implicit $z16 { +# CHECK: $z3 = MOVPRFX_ZZ $z19 +# CHECK: $z3 = FMUL_ZPmZ_S renamable $p0, killed $z3, renamable $z16 +# CHECK: } + +--- +name: bundled +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: + hasRedZone: false +body: | + bb.0.entry: + liveins: $z3, $z19, $p0, $z16 + renamable $q0 = LDRQui $sp, 1 :: (load 16) + STRSui renamable $s0, $sp, 9, implicit killed $q0 :: (store (s32)) + BUNDLE implicit-def $z3, implicit-def $q3, implicit-def $d3, implicit-def $s3, implicit-def $h3, implicit-def $b3, implicit $z19, implicit $p0, implicit $z16 { + $z3 = MOVPRFX_ZZ $z19 + $z3 = FMUL_ZPmZ_S renamable $p0, killed $z3, renamable $z16 + } + renamable $q0 = LDRQui $sp, 0 :: (load 16, align 32) + STRSui renamable $s0, $sp, 10, implicit killed $q0 :: (store (s32)) + RET undef $lr +... +--- +name: bundled_clone +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: + hasRedZone: false +body: | + bb.0.entry: + liveins: $z3, $z19, $p0, $z16 + renamable $q0 = LDRQui $sp, 1 :: (load 16) + STRSui renamable $s0, $sp, 9, implicit killed $q0 :: (store (s32)) + BUNDLE implicit-def $z3, implicit-def $q3, implicit-def $d3, implicit-def $s3, implicit-def $h3, implicit-def $b3, implicit $z19, implicit $p0, implicit $z16 { + $z3 = MOVPRFX_ZZ $z19 + $z3 = FMUL_ZPmZ_S renamable $p0, killed $z3, renamable $z16 + } + renamable $q0 = LDRQui $sp, 0 :: (load 16, align 32) + STRSui renamable $s0, $sp, 10, implicit killed $q0 :: (store (s32)) + RET undef $lr +... From 01e56849001b4ace984e9557abc82bc051e03677 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Wed, 4 Sep 2024 10:09:04 +0200 Subject: [PATCH 040/425] [clang] Respect the lifetimebound in assignment operator. 
(#106997) Fixes #106372 --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/CheckExprLifetime.cpp | 64 +++++++++++++++-------- clang/lib/Sema/CheckExprLifetime.h | 1 + clang/lib/Sema/SemaOverload.cpp | 4 +- clang/test/SemaCXX/attr-lifetimebound.cpp | 20 +++++++ 5 files changed, 67 insertions(+), 24 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 45b08128600ef..511724c73015e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -268,6 +268,8 @@ Improvements to Clang's diagnostics - Improved diagnostic when trying to overload a function in an ``extern "C"`` context. (#GH80235) +- Clang now respects lifetimebound attribute for the assignment operator parameter. (#GH106372). + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index f28789dba34e1..f7540a6e3a897 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -326,24 +326,11 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { return false; } -static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { - const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); - if (!TSI) - return false; - // Don't declare this variable in the second operand of the for-statement; - // GCC miscompiles that by ending its lifetime before evaluating the - // third operand. See gcc.gnu.org/PR86769. - AttributedTypeLoc ATL; - for (TypeLoc TL = TSI->getTypeLoc(); - (ATL = TL.getAsAdjusted()); - TL = ATL.getModifiedLoc()) { - if (ATL.getAttrAs()) - return true; - } - - // Assume that all assignment operators with a "normal" return type return - // *this, that is, an lvalue reference that is the same type as the implicit - // object parameter (or the LHS for a non-member operator$=). +// Return true if this is an "normal" assignment operator. +// We assuments that a normal assingment operator always returns *this, that is, +// an lvalue reference that is the same type as the implicit object parameter +// (or the LHS for a non-member operator$=). +static bool isNormalAsisgnmentOperator(const FunctionDecl *FD) { OverloadedOperatorKind OO = FD->getDeclName().getCXXOverloadedOperator(); if (OO == OO_Equal || isCompoundAssignmentOperator(OO)) { QualType RetT = FD->getReturnType(); @@ -359,10 +346,27 @@ static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { return true; } } - return false; } +static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { + const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); + if (!TSI) + return false; + // Don't declare this variable in the second operand of the for-statement; + // GCC miscompiles that by ending its lifetime before evaluating the + // third operand. See gcc.gnu.org/PR86769. + AttributedTypeLoc ATL; + for (TypeLoc TL = TSI->getTypeLoc(); + (ATL = TL.getAsAdjusted()); + TL = ATL.getModifiedLoc()) { + if (ATL.getAttrAs()) + return true; + } + + return isNormalAsisgnmentOperator(FD); +} + // Visit lifetimebound or gsl-pointer arguments. 
static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, LocalVisitor Visit, @@ -968,6 +972,22 @@ static bool pathOnlyHandlesGslPointer(IndirectLocalPath &Path) { return false; } +static bool isAssginmentOperatorLifetimeBound(CXXMethodDecl *CMD) { + if (!CMD) + return false; + return isNormalAsisgnmentOperator(CMD) && CMD->param_size() == 1 && + CMD->getParamDecl(0)->hasAttr(); +} + +static bool shouldRunGSLAssignmentAnalysis(const Sema &SemaRef, + const AssignedEntity &Entity) { + bool EnableGSLAssignmentWarnings = !SemaRef.getDiagnostics().isIgnored( + diag::warn_dangling_lifetime_pointer_assignment, SourceLocation()); + return (EnableGSLAssignmentWarnings && + (isRecordWithAttr(Entity.LHS->getType()) || + isAssginmentOperatorLifetimeBound(Entity.AssignmentOperator))); +} + static void checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity, const InitializedEntity *ExtendingEntity, @@ -1267,8 +1287,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, }; llvm::SmallVector Path; - if (EnableLifetimeWarnings && LK == LK_Assignment && - isRecordWithAttr(AEntity->LHS->getType())) + if (LK == LK_Assignment && shouldRunGSLAssignmentAnalysis(SemaRef, *AEntity)) Path.push_back({IndirectLocalPathEntry::GslPointerAssignment, Init}); if (Init->isGLValue()) @@ -1301,8 +1320,7 @@ void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, diag::warn_dangling_pointer_assignment, SourceLocation()); bool RunAnalysis = (EnableDanglingPointerAssignment && Entity.LHS->getType()->isPointerType()) || - (EnableLifetimeWarnings && - isRecordWithAttr(Entity.LHS->getType())); + shouldRunGSLAssignmentAnalysis(SemaRef, Entity); if (!RunAnalysis) return; diff --git a/clang/lib/Sema/CheckExprLifetime.h b/clang/lib/Sema/CheckExprLifetime.h index af381fb96c4d6..8c8d0806dee0a 100644 --- a/clang/lib/Sema/CheckExprLifetime.h +++ b/clang/lib/Sema/CheckExprLifetime.h @@ -22,6 +22,7 @@ namespace clang::sema { struct AssignedEntity { // The left-hand side expression of the assignment. Expr *LHS = nullptr; + CXXMethodDecl *AssignmentOperator = nullptr; }; /// Check that the lifetime of the given expr (and its subobjects) is diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 95551173df91a..861b0a91240b3 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -14768,7 +14768,9 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, // Check for a self move. DiagnoseSelfMove(Args[0], Args[1], OpLoc); // lifetime check. 
- checkExprLifetime(*this, AssignedEntity{Args[0]}, Args[1]); + checkExprLifetime( + *this, AssignedEntity{Args[0], dyn_cast(FnDecl)}, + Args[1]); } if (ImplicitThis) { QualType ThisType = Context.getPointerType(ImplicitThis->getType()); diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp index 6566ed6270cd1..0fb997a567108 100644 --- a/clang/test/SemaCXX/attr-lifetimebound.cpp +++ b/clang/test/SemaCXX/attr-lifetimebound.cpp @@ -287,3 +287,23 @@ std::span test2() { return abc; // expected-warning {{address of stack memory associated with local variable}} } } // namespace ctor_cases + +namespace GH106372 { +class [[gsl::Owner]] Foo {}; +class [[gsl::Pointer]] FooView {}; + +class NonAnnotatedFoo {}; +class NonAnnotatedFooView {}; + +template +struct StatusOr { + template + StatusOr& operator=(U&& v [[clang::lifetimebound]]); +}; + +void test(StatusOr foo1, StatusOr foo2) { + foo1 = Foo(); // expected-warning {{object backing the pointer foo1 will be destroyed at the end}} + // No warning on non-gsl annotated types. + foo2 = NonAnnotatedFoo(); +} +} // namespace GH106372 From 771b7af1db15e59f370ccadaa98bee8e5270b5f1 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Wed, 4 Sep 2024 10:13:47 +0200 Subject: [PATCH 041/425] =?UTF-8?q?Reapply=20"[llvm/DWARF]=20Recursively?= =?UTF-8?q?=20resolve=20DW=5FAT=5Fsignature=20references"=E2=80=A6=20(#994?= =?UTF-8?q?95)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … (#99444) The previous version introduced a bug (caught by cross-project tests). Explicit signature resolution is still necessary when one wants to access the children (not attributes) of a given DIE. The new version keeps just the findRecursively extension, and reverts all the DWARFTypePrinter modifications. 
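To illustrate the intended effect on API users, a hedged sketch follows. The function and the skeleton DIE it receives are hypothetical and not part of this patch; it assumes a DW_TAG_structure_type DIE that carries only DW_AT_signature, as in the new unit test.

  #include "llvm/DebugInfo/DWARF/DWARFDie.h"
  #include <cassert>
  #include <optional>
  using namespace llvm;

  void inspectSkeleton(DWARFDie SkeletonDie) {
    // A plain find() still fails: the name lives in the referenced type unit.
    assert(!SkeletonDie.find(dwarf::DW_AT_name));
    // findRecursively() now also chases DW_AT_signature (in addition to
    // DW_AT_abstract_origin and DW_AT_specification), so the name resolves.
    std::optional<DWARFFormValue> Name =
        SkeletonDie.findRecursively(dwarf::DW_AT_name);
    assert(Name && "resolved through the type unit");
    // Walking the referenced type's *children* still requires resolving the
    // signature explicitly, which is why the DWARFTypePrinter changes were
    // kept out of this reapplication.
    DWARFDie Definition =
        SkeletonDie.getAttributeValueAsReferencedDie(dwarf::DW_AT_signature);
    for (DWARFDie Child : Definition.children())
      (void)Child; // member DIEs live here, not under SkeletonDie.
  }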
--- llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 13 ++-- .../DebugInfo/DWARF/DWARFDieTest.cpp | 61 +++++++++++++++++++ 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 5daa093ee8a1b..9c26c4f8892b0 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -291,13 +291,12 @@ DWARFDie::findRecursively(ArrayRef Attrs) const { if (auto Value = Die.find(Attrs)) return Value; - if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin)) - if (Seen.insert(D).second) - Worklist.push_back(D); - - if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_specification)) - if (Seen.insert(D).second) - Worklist.push_back(D); + for (dwarf::Attribute Attr : + {DW_AT_abstract_origin, DW_AT_specification, DW_AT_signature}) { + if (auto D = Die.getAttributeValueAsReferencedDie(Attr)) + if (Seen.insert(D).second) + Worklist.push_back(D); + } } return std::nullopt; diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp index e1057b214ee4d..485ec720ffad6 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp @@ -643,4 +643,65 @@ TEST(DWARFDie, getDeclFileSpecificationAcrossCUBoundary) { EXPECT_EQ(DeclFile, Ref); } +TEST(DWARFDie, getNameFromTypeUnit) { + const char *yamldata = R"( + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + - Code: 0x2 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_signature + Form: DW_FORM_ref_sig8 + - Code: 0x3 + Tag: DW_TAG_type_unit + Children: DW_CHILDREN_yes + - Code: 0x4 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AbbrevTableID: 0 + Entries: + - AbbrCode: 0x1 + - AbbrCode: 0x2 + Values: + - Value: 0xdeadbeefbaadf00d + - AbbrCode: 0x0 + - Version: 5 + UnitType: DW_UT_type + AbbrevTableID: 0 + TypeSignature: 0xdeadbeefbaadf00d + TypeOffset: 25 + Entries: + - AbbrCode: 0x3 + - AbbrCode: 0x4 + Values: + - CStr: "STRUCT" + - AbbrCode: 0x0 + )"; + + Expected>> Sections = + DWARFYAML::emitDebugSections(StringRef(yamldata), + /*IsLittleEndian=*/true, + /*Is64BitAddrSize=*/true); + ASSERT_THAT_EXPECTED(Sections, Succeeded()); + std::unique_ptr Ctx = + DWARFContext::create(*Sections, 4, /*isLittleEndian=*/true); + DWARFCompileUnit *CU = Ctx->getCompileUnitForOffset(0); + ASSERT_NE(nullptr, CU); + DWARFDie Die = CU->getUnitDIE(/*ExtractUnitDIEOnly=*/false).getFirstChild(); + ASSERT_TRUE(Die.isValid()); + + ASSERT_STREQ(Die.getName(DINameKind::ShortName), "STRUCT"); +} + } // end anonymous namespace From 009184fc3920f8a14dff9971edf68754ba28da5f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 01:28:20 -0700 Subject: [PATCH 042/425] [ThinLTO] Avoid repeated std::map lookups (NFC) (#107156) This patch avoids repeated std::map lookups with try_emplace. While I am at it, this patch adds a couple of calls to std::vector::reserve. 
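For illustration, the same lookup-avoidance pattern on a plain std::map; the key and value types below are placeholders, not the actual summary-index types.

  #include <cstdint>
  #include <map>

  struct Info {
    explicit Info(bool IsAnalysis) : IsAnalysis(IsAnalysis) {}
    bool IsAnalysis;
  };

  // Before: up to two lookups (count(), then emplace()/find()).
  Info &getOrCreateTwoLookups(std::map<uint64_t, Info> &M, uint64_t GUID) {
    if (!M.count(GUID))
      M.emplace(GUID, /*IsAnalysis=*/false);
    return M.find(GUID)->second;
  }

  // After: try_emplace() does a single lookup and only constructs the value
  // when the key is absent; the returned iterator is valid either way.
  Info &getOrCreateOneLookup(std::map<uint64_t, Info> &M, uint64_t GUID) {
    return M.try_emplace(GUID, /*IsAnalysis=*/false).first->second;
  }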
--- llvm/include/llvm/IR/ModuleSummaryIndexYAML.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h index b2747d24c5396..6cc533f043a51 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -214,15 +214,13 @@ template <> struct CustomMappingTraits { io.setError("key not an integer"); return; } - if (!V.count(KeyInt)) - V.emplace(KeyInt, /*IsAnalysis=*/false); - auto &Elem = V.find(KeyInt)->second; + auto &Elem = V.try_emplace(KeyInt, /*IsAnalysis=*/false).first->second; for (auto &FSum : FSums) { std::vector Refs; + Refs.reserve(FSum.Refs.size()); for (auto &RefGUID : FSum.Refs) { - if (!V.count(RefGUID)) - V.emplace(RefGUID, /*IsAnalysis=*/false); - Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*V.find(RefGUID))); + auto It = V.try_emplace(RefGUID, /*IsAnalysis=*/false).first; + Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*It)); } Elem.SummaryList.push_back(std::make_unique( GlobalValueSummary::GVFlags( @@ -247,6 +245,7 @@ template <> struct CustomMappingTraits { for (auto &Sum : P.second.SummaryList) { if (auto *FSum = dyn_cast(Sum.get())) { std::vector Refs; + Refs.reserve(FSum->refs().size()); for (auto &VI : FSum->refs()) Refs.push_back(VI.getGUID()); FSums.push_back(FunctionSummaryYaml{ From 8bfd6b953fc119bbc37c1755e701261fcfb31ad2 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 01:28:49 -0700 Subject: [PATCH 043/425] [SSAUpdater] Use DenseMap::operator[] (NFC) (#107179) I'm planning to deprecate DenseMap::FindAndConstruct in favor of DenseMap::operator[]. I thought about renaming the variable to PredInfo, but the name is taken, so I am leaving the name as is. --- llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h index 010d6b0de9f6a..746926e5bee33 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h @@ -139,17 +139,16 @@ class SSAUpdaterImpl { for (unsigned p = 0; p != Info->NumPreds; ++p) { BlkT *Pred = Preds[p]; // Check if BBMap already has a BBInfo for the predecessor block. - typename BBMapTy::value_type &BBMapBucket = - BBMap.FindAndConstruct(Pred); - if (BBMapBucket.second) { - Info->Preds[p] = BBMapBucket.second; + BBInfo *&BBMapBucket = BBMap[Pred]; + if (BBMapBucket) { + Info->Preds[p] = BBMapBucket; continue; } // Create a new BBInfo for the predecessor. 
ValT PredVal = AvailableVals->lookup(Pred); BBInfo *PredInfo = new (Allocator) BBInfo(Pred, PredVal); - BBMapBucket.second = PredInfo; + BBMapBucket = PredInfo; Info->Preds[p] = PredInfo; if (PredInfo->AvailableVal) { From e99eb89d5d97efc709f18f9369f2ec087352baaa Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 01:29:13 -0700 Subject: [PATCH 044/425] [SimplifyCFG] Use range-based for loops (NFC) (#107180) --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 15de40c7b0996..f9db996cdc358 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1000,20 +1000,20 @@ bool SimplifyCFGOpt::simplifyEqualityComparisonWithOnlyPredecessor( // which value (or set of values) this is. ConstantInt *TIV = nullptr; BasicBlock *TIBB = TI->getParent(); - for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].Dest == TIBB) { + for (const auto &[Value, Dest] : PredCases) + if (Dest == TIBB) { if (TIV) return false; // Cannot handle multiple values coming to this block. - TIV = PredCases[i].Value; + TIV = Value; } assert(TIV && "No edge from pred to succ?"); // Okay, we found the one constant that our value can be if we get into TI's // BB. Find out which successor will unconditionally be branched to. BasicBlock *TheRealDest = nullptr; - for (unsigned i = 0, e = ThisCases.size(); i != e; ++i) - if (ThisCases[i].Value == TIV) { - TheRealDest = ThisCases[i].Dest; + for (const auto &[Value, Dest] : ThisCases) + if (Value == TIV) { + TheRealDest = Dest; break; } From aacdc657fc255b2547bb37ee9bacde2df0452298 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Wed, 4 Sep 2024 16:34:27 +0800 Subject: [PATCH 045/425] [Clang] Preserve the ExpandsToEmpty flag in PackIndexingType (#107181) Similar to PackIndexingExpr, we should avoid another round of transformation of the pattern if the pattern has already turned out to be an empty pack. As an outcome, the empty SubstTemplateTypeParmPackType won't occur, and we don't need to collect any unexpanded packs. Fixes https://github.com/llvm/llvm-project/issues/105903 --- clang/docs/ReleaseNotes.rst | 2 +- clang/include/clang/AST/Type.h | 13 +++++++++---- clang/include/clang/AST/TypeProperties.td | 5 ++++- clang/lib/AST/ASTContext.cpp | 12 +++++++----- clang/lib/AST/Type.cpp | 7 ++++--- clang/lib/Sema/TreeTransform.h | 4 ++-- clang/test/SemaCXX/cxx2c-pack-indexing.cpp | 12 ++++++++++++ 7 files changed, 39 insertions(+), 16 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 511724c73015e..4128ca78ce396 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -345,7 +345,7 @@ Bug Fixes to C++ Support - Fixed a constraint comparison bug for friend declarations. (#GH78101) - Fix handling of ``_`` as the name of a lambda's init capture variable. (#GH107024) - Fix an issue with dependent source location expressions (#GH106428), (#GH81155), (#GH80210), (#GH85373) - +- Fixed a bug in the substitution of empty pack indexing types. 
(#GH105903) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 08f7638d7d8f9..853226118af40 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -5828,12 +5828,15 @@ class PackIndexingType final QualType Pattern; Expr *IndexExpr; - unsigned Size; + unsigned Size : 31; + + LLVM_PREFERRED_TYPE(bool) + unsigned ExpandsToEmptyPack : 1; protected: friend class ASTContext; // ASTContext creates these. PackIndexingType(const ASTContext &Context, QualType Canonical, - QualType Pattern, Expr *IndexExpr, + QualType Pattern, Expr *IndexExpr, bool ExpandsToEmptyPack, ArrayRef Expansions = {}); public: @@ -5857,6 +5860,8 @@ class PackIndexingType final bool hasSelectedType() const { return getSelectedIndex() != std::nullopt; } + bool expandsToEmptyPack() const { return ExpandsToEmptyPack; } + ArrayRef getExpansions() const { return {getExpansionsPtr(), Size}; } @@ -5869,10 +5874,10 @@ class PackIndexingType final if (hasSelectedType()) getSelectedType().Profile(ID); else - Profile(ID, Context, getPattern(), getIndexExpr()); + Profile(ID, Context, getPattern(), getIndexExpr(), expandsToEmptyPack()); } static void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context, - QualType Pattern, Expr *E); + QualType Pattern, Expr *E, bool ExpandsToEmptyPack); private: const QualType *getExpansionsPtr() const { diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 3df19315fd573..539a344cb0b69 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -473,9 +473,12 @@ let Class = PackIndexingType in { def : Property<"indexExpression", ExprRef> { let Read = [{ node->getIndexExpr() }]; } + def : Property<"expandsToEmptyPack", Bool> { + let Read = [{ node->expandsToEmptyPack() }]; + } def : Creator<[{ - return ctx.getPackIndexingType(pattern, indexExpression); + return ctx.getPackIndexingType(pattern, indexExpression, expandsToEmptyPack); }]>; } diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index c61234aa4d1af..341ea98a1b149 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -6188,11 +6188,13 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, ArrayRef Expansions, int Index) const { QualType Canonical; + bool ExpandsToEmptyPack = FullySubstituted && Expansions.empty(); if (FullySubstituted && Index != -1) { Canonical = getCanonicalType(Expansions[Index]); } else { llvm::FoldingSetNodeID ID; - PackIndexingType::Profile(ID, *this, Pattern, IndexExpr); + PackIndexingType::Profile(ID, *this, Pattern, IndexExpr, + ExpandsToEmptyPack); void *InsertPos = nullptr; PackIndexingType *Canon = DependentPackIndexingTypes.FindNodeOrInsertPos(ID, InsertPos); @@ -6200,8 +6202,8 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, void *Mem = Allocate( PackIndexingType::totalSizeToAlloc(Expansions.size()), TypeAlignment); - Canon = new (Mem) - PackIndexingType(*this, QualType(), Pattern, IndexExpr, Expansions); + Canon = new (Mem) PackIndexingType(*this, QualType(), Pattern, IndexExpr, + ExpandsToEmptyPack, Expansions); DependentPackIndexingTypes.InsertNode(Canon, InsertPos); } Canonical = QualType(Canon, 0); @@ -6210,8 +6212,8 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, void *Mem = Allocate(PackIndexingType::totalSizeToAlloc(Expansions.size()), TypeAlignment); - auto *T = 
new (Mem) - PackIndexingType(*this, Canonical, Pattern, IndexExpr, Expansions); + auto *T = new (Mem) PackIndexingType(*this, Canonical, Pattern, IndexExpr, + ExpandsToEmptyPack, Expansions); Types.push_back(T); return QualType(T, 0); } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index e89ce2e4b3844..b976d1a0ee60a 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -3992,12 +3992,12 @@ void DependentDecltypeType::Profile(llvm::FoldingSetNodeID &ID, PackIndexingType::PackIndexingType(const ASTContext &Context, QualType Canonical, QualType Pattern, - Expr *IndexExpr, + Expr *IndexExpr, bool ExpandsToEmptyPack, ArrayRef Expansions) : Type(PackIndexing, Canonical, computeDependence(Pattern, IndexExpr, Expansions)), Context(Context), Pattern(Pattern), IndexExpr(IndexExpr), - Size(Expansions.size()) { + Size(Expansions.size()), ExpandsToEmptyPack(ExpandsToEmptyPack) { std::uninitialized_copy(Expansions.begin(), Expansions.end(), getTrailingObjects()); @@ -4042,9 +4042,10 @@ PackIndexingType::computeDependence(QualType Pattern, Expr *IndexExpr, void PackIndexingType::Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context, QualType Pattern, - Expr *E) { + Expr *E, bool ExpandsToEmptyPack) { Pattern.Profile(ID); E->Profile(ID, Context, true); + ID.AddBoolean(ExpandsToEmptyPack); } UnaryTransformType::UnaryTransformType(QualType BaseType, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 66e3f27fed9de..27eac401c28f5 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -6687,10 +6687,10 @@ TreeTransform::TransformPackIndexingType(TypeLocBuilder &TLB, bool NotYetExpanded = Types.empty(); bool FullySubstituted = true; - if (Types.empty()) + if (Types.empty() && !PIT->expandsToEmptyPack()) Types = llvm::ArrayRef(&Pattern, 1); - for (const QualType &T : Types) { + for (QualType T : Types) { if (!T->containsUnexpandedParameterPack()) { QualType Transformed = getDerived().TransformType(T); if (Transformed.isNull()) diff --git a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp index 7d7e808746217..962dbb8137f28 100644 --- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp +++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp @@ -258,4 +258,16 @@ void f() { vars<0>::x<0>(); } +} // namespace GH105900 + +namespace GH105903 { + +template struct temp { + template static auto x() -> opts... [s] {} // expected-note {{invalid index 0 for pack 'opts' of size 0}} +}; + +void f() { + temp<>::x<0>(); // expected-error {{no matching}} } + +} // namespace GH105903 From 331f8225f37b714a4df7ff3176b574b756f4d965 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Wed, 4 Sep 2024 11:35:18 +0300 Subject: [PATCH 046/425] [mlir][AsmParser] Expose `parseMinus()` (#106881) Found while working on parser for custom expression type for my dialect. Builtin `AffineExpr` uses low-level parser API which is not available for external users. --- mlir/include/mlir/IR/OpImplementation.h | 6 ++++++ mlir/lib/AsmParser/AsmParserImpl.h | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h index ae412c7227f8e..e2472eea8a371 100644 --- a/mlir/include/mlir/IR/OpImplementation.h +++ b/mlir/include/mlir/IR/OpImplementation.h @@ -641,6 +641,12 @@ class AsmParser { /// Parse a '+' token if present. virtual ParseResult parseOptionalPlus() = 0; + /// Parse a '-' token. 
+ virtual ParseResult parseMinus() = 0; + + /// Parse a '-' token if present. + virtual ParseResult parseOptionalMinus() = 0; + /// Parse a '*' token. virtual ParseResult parseStar() = 0; diff --git a/mlir/lib/AsmParser/AsmParserImpl.h b/mlir/lib/AsmParser/AsmParserImpl.h index b12687833e3fd..04250f63dcd25 100644 --- a/mlir/lib/AsmParser/AsmParserImpl.h +++ b/mlir/lib/AsmParser/AsmParserImpl.h @@ -226,6 +226,16 @@ class AsmParserImpl : public BaseT { return success(parser.consumeIf(Token::plus)); } + /// Parses a '-' token. + ParseResult parseMinus() override { + return parser.parseToken(Token::minus, "expected '-'"); + } + + /// Parses a '-' token if present. + ParseResult parseOptionalMinus() override { + return success(parser.consumeIf(Token::minus)); + } + /// Parse a '|' token. ParseResult parseVerticalBar() override { return parser.parseToken(Token::vertical_bar, "expected '|'"); From 8133d47632f35df00933bfd3d3626b003206ede4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 01:35:47 -0700 Subject: [PATCH 047/425] [CGOpenMPRuntime] Use DenseMap::operator[] (NFC) (#107185) I'm planning to deprecate DenseMap::FindAndConstruct in favor of DenseMap::operator[]. --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 29 +++++++++++++-------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 23b977be81602..3d392d869ee39 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1327,25 +1327,24 @@ llvm::Function *CGOpenMPRuntime::emitTaskOutlinedFunction( void CGOpenMPRuntime::setLocThreadIdInsertPt(CodeGenFunction &CGF, bool AtCurrentPoint) { - auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); - assert(!Elem.second.ServiceInsertPt && "Insert point is set already."); + auto &Elem = OpenMPLocThreadIDMap[CGF.CurFn]; + assert(!Elem.ServiceInsertPt && "Insert point is set already."); llvm::Value *Undef = llvm::UndefValue::get(CGF.Int32Ty); if (AtCurrentPoint) { - Elem.second.ServiceInsertPt = new llvm::BitCastInst( - Undef, CGF.Int32Ty, "svcpt", CGF.Builder.GetInsertBlock()); + Elem.ServiceInsertPt = new llvm::BitCastInst(Undef, CGF.Int32Ty, "svcpt", + CGF.Builder.GetInsertBlock()); } else { - Elem.second.ServiceInsertPt = - new llvm::BitCastInst(Undef, CGF.Int32Ty, "svcpt"); - Elem.second.ServiceInsertPt->insertAfter(CGF.AllocaInsertPt); + Elem.ServiceInsertPt = new llvm::BitCastInst(Undef, CGF.Int32Ty, "svcpt"); + Elem.ServiceInsertPt->insertAfter(CGF.AllocaInsertPt); } } void CGOpenMPRuntime::clearLocThreadIdInsertPt(CodeGenFunction &CGF) { - auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); - if (Elem.second.ServiceInsertPt) { - llvm::Instruction *Ptr = Elem.second.ServiceInsertPt; - Elem.second.ServiceInsertPt = nullptr; + auto &Elem = OpenMPLocThreadIDMap[CGF.CurFn]; + if (Elem.ServiceInsertPt) { + llvm::Instruction *Ptr = Elem.ServiceInsertPt; + Elem.ServiceInsertPt = nullptr; Ptr->eraseFromParent(); } } @@ -1441,18 +1440,18 @@ llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF, // kmpc_global_thread_num(ident_t *loc). // Generate thread id value and cache this value for use across the // function. 
- auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); - if (!Elem.second.ServiceInsertPt) + auto &Elem = OpenMPLocThreadIDMap[CGF.CurFn]; + if (!Elem.ServiceInsertPt) setLocThreadIdInsertPt(CGF); CGBuilderTy::InsertPointGuard IPG(CGF.Builder); - CGF.Builder.SetInsertPoint(Elem.second.ServiceInsertPt); + CGF.Builder.SetInsertPoint(Elem.ServiceInsertPt); auto DL = ApplyDebugLocation::CreateDefaultArtificial(CGF, Loc); llvm::CallInst *Call = CGF.Builder.CreateCall( OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL___kmpc_global_thread_num), emitUpdateLocation(CGF, Loc)); Call->setCallingConv(CGF.getRuntimeCC()); - Elem.second.ThreadID = Call; + Elem.ThreadID = Call; return Call; } From 12d678a8eb11821e20eab86445f0cc9b66c24990 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 4 Sep 2024 09:37:15 +0100 Subject: [PATCH 048/425] [AArch64] Add codegen tests for zext(deinterleave). NFC --- llvm/test/CodeGen/AArch64/zext-shuffle.ll | 561 ++++++++++++++++++++++ 1 file changed, 561 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/zext-shuffle.ll diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll new file mode 100644 index 0000000000000..4ef8daf141715 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll @@ -0,0 +1,561 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -o - %s | FileCheck %s + +define <2 x i64> @v2i64_02(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_02: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <2 x i64> @v2i64_13(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_13: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip2 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <2 x i64> @v2i64_04812(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_04812: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <2 x i64> @v2i64_15913(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_15913: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <2 x i64> @v2i64_261014(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_261014: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <2 x i64> @v2i64_37(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_37: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip2 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: 
ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) { +; CHECK-LABEL: v2i64_i16_04812: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> + %z1 = zext <4 x i16> %s1 to <4 x i64> + ret <4 x i64> %z1 +} + +define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) { +; CHECK-LABEL: v2i64_i16_15913: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> + %z1 = zext <4 x i16> %s1 to <4 x i64> + ret <4 x i64> %z1 +} + +define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) { +; CHECK-LABEL: v2i64_i16_261014: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> + %z1 = zext <4 x i16> %s1 to <4 x i64> + ret <4 x i64> %z1 +} + +define <4 x i64> @v2i64_i16_371115(<16 x i16> %a) { +; CHECK-LABEL: v2i64_i16_371115: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> + %z1 = zext <4 x i16> %s1 to <4 x i64> + ret <4 x i64> %z1 +} + + +define <4 x i32> @v4i32_0246(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_0246: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_1357: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v0.8h +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @v4i32_04812(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_04812: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: // kill: def $q1 
killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @v4i32_15913(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_15913: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @v4i32_261014(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_261014: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @v4i32_371115(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_371115: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI15_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + + +define <8 x i16> @v8i16_0246(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_0246: +; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +define <8 x i16> @v8i16_1357(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_1357: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +define <8 x i16> @v8i16_04812(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_04812: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +define <8 x i16> @v8i16_15913(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_15913: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 
+; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +define <8 x i16> @v8i16_261014(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_261014: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_371115: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI21_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + + +define <8 x i64> @zext_add(<32 x i16> %l) { +; CHECK-LABEL: zext_add: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI22_0 +; CHECK-NEXT: adrp x9, .LCPI22_3 +; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_0] +; CHECK-NEXT: adrp x8, .LCPI22_1 +; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI22_3] +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI22_1] +; CHECK-NEXT: adrp x8, .LCPI22_2 +; CHECK-NEXT: adrp x9, .LCPI22_7 +; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI22_2] +; CHECK-NEXT: adrp x8, .LCPI22_4 +; CHECK-NEXT: ldr q18, [x9, :lo12:.LCPI22_7] +; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI22_4] +; CHECK-NEXT: adrp x8, .LCPI22_5 +; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v5.16b, { v0.16b, v1.16b }, v5.16b +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI22_5] +; CHECK-NEXT: adrp x8, .LCPI22_6 +; CHECK-NEXT: tbl v7.16b, { v0.16b, v1.16b }, v7.16b +; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI22_6] +; CHECK-NEXT: tbl v17.16b, { v0.16b, v1.16b }, v17.16b +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b +; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v4.16b +; CHECK-NEXT: tbl v4.16b, { v2.16b, v3.16b }, v6.16b +; CHECK-NEXT: tbl v6.16b, { v2.16b, v3.16b }, v16.16b +; CHECK-NEXT: tbl v2.16b, { v2.16b, v3.16b }, v19.16b +; CHECK-NEXT: uaddl v5.4s, v5.4h, v7.4h +; CHECK-NEXT: uaddl v7.4s, v17.4h, v0.4h +; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v4.8h +; CHECK-NEXT: uaddl2 v2.4s, v6.8h, v2.8h +; CHECK-NEXT: uaddl v0.2d, v5.2s, v7.2s +; CHECK-NEXT: uaddl2 v1.2d, v5.4s, v7.4s +; CHECK-NEXT: uaddl2 v3.2d, v4.4s, v2.4s +; CHECK-NEXT: uaddl v2.2d, v4.2s, v2.2s +; CHECK-NEXT: ret + %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z1 = zext <8 x i16> %s1 to <8 x i64> + %s2 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z2 = zext <8 x i16> %s2 to <8 x i64> + %s3 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z3 = zext <8 x i16> %s3 to <8 x i64> + %s4 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z4 = zext <8 x i16> %s4 to <8 x 
i64> + %a = add <8 x i64> %z1, %z2 + %b = add <8 x i64> %z3, %z4 + %c = add <8 x i64> %a, %b + ret <8 x i64> %c +} + +define <8 x i64> @zext_load_add(ptr %p) { +; CHECK-LABEL: zext_load_add: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +; CHECK-NEXT: uaddl v4.4s, v0.4h, v1.4h +; CHECK-NEXT: uaddl v5.4s, v2.4h, v3.4h +; CHECK-NEXT: uaddl2 v6.4s, v0.8h, v1.8h +; CHECK-NEXT: uaddl2 v2.4s, v2.8h, v3.8h +; CHECK-NEXT: uaddl v0.2d, v4.2s, v5.2s +; CHECK-NEXT: uaddl2 v1.2d, v4.4s, v5.4s +; CHECK-NEXT: uaddl2 v3.2d, v6.4s, v2.4s +; CHECK-NEXT: uaddl v2.2d, v6.2s, v2.2s +; CHECK-NEXT: ret + %l = load <32 x i16>, ptr %p + %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z1 = zext <8 x i16> %s1 to <8 x i64> + %s2 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z2 = zext <8 x i16> %s2 to <8 x i64> + %s3 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z3 = zext <8 x i16> %s3 to <8 x i64> + %s4 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z4 = zext <8 x i16> %s4 to <8 x i64> + %a = add <8 x i64> %z1, %z2 + %b = add <8 x i64> %z3, %z4 + %c = add <8 x i64> %a, %b + ret <8 x i64> %c +} + +define <8 x double> @uitofp_fadd(<32 x i16> %l) { +; CHECK-LABEL: uitofp_fadd: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI24_0 +; CHECK-NEXT: adrp x9, .LCPI24_1 +; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: adrp x10, .LCPI24_6 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_0] +; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI24_1] +; CHECK-NEXT: adrp x8, .LCPI24_2 +; CHECK-NEXT: adrp x9, .LCPI24_3 +; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI24_2] +; CHECK-NEXT: adrp x8, .LCPI24_4 +; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v4.16b, { v0.16b, v1.16b }, v4.16b +; CHECK-NEXT: tbl v5.16b, { v2.16b, v3.16b }, v5.16b +; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI24_3] +; CHECK-NEXT: adrp x9, .LCPI24_5 +; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI24_4] +; CHECK-NEXT: adrp x8, .LCPI24_7 +; CHECK-NEXT: ldr q17, [x9, :lo12:.LCPI24_5] +; CHECK-NEXT: ldr q18, [x10, :lo12:.LCPI24_6] +; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI24_7] +; CHECK-NEXT: tbl v6.16b, { v0.16b, v1.16b }, v6.16b +; CHECK-NEXT: tbl v7.16b, { v2.16b, v3.16b }, v7.16b +; CHECK-NEXT: tbl v16.16b, { v0.16b, v1.16b }, v16.16b +; CHECK-NEXT: tbl v17.16b, { v2.16b, v3.16b }, v17.16b +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b +; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v19.16b +; CHECK-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll2 v7.4s, v7.8h, #0 +; CHECK-NEXT: ushll v6.4s, v6.4h, #0 +; CHECK-NEXT: ushll v16.4s, v16.4h, #0 +; CHECK-NEXT: ushll2 v20.2d, v5.4s, #0 +; CHECK-NEXT: ushll2 v21.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v17.4s, v17.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-NEXT: ushll v2.2d, v5.2s, #0 +; CHECK-NEXT: ushll v3.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v4.2d, v7.4s, #0 +; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0 +; CHECK-NEXT: ushll v7.2d, v7.2s, #0 +; CHECK-NEXT: ucvtf v18.2d, v20.2d +; CHECK-NEXT: ucvtf v19.2d, v21.2d +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll2 v20.2d, v17.4s, #0 +; CHECK-NEXT: ushll2 v21.2d, v16.4s, #0 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #0 +; CHECK-NEXT: ushll v22.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v23.2d, 
v1.4s, #0 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ucvtf v3.2d, v3.2d +; CHECK-NEXT: ucvtf v4.2d, v4.2d +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: ucvtf v6.2d, v6.2d +; CHECK-NEXT: ucvtf v20.2d, v20.2d +; CHECK-NEXT: ucvtf v21.2d, v21.2d +; CHECK-NEXT: ucvtf v17.2d, v17.2d +; CHECK-NEXT: ucvtf v16.2d, v16.2d +; CHECK-NEXT: ucvtf v22.2d, v22.2d +; CHECK-NEXT: ucvtf v23.2d, v23.2d +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: fadd v4.2d, v18.2d, v4.2d +; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d +; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d +; CHECK-NEXT: fadd v5.2d, v19.2d, v5.2d +; CHECK-NEXT: fadd v6.2d, v16.2d, v22.2d +; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d +; CHECK-NEXT: fadd v7.2d, v17.2d, v1.2d +; CHECK-NEXT: fadd v1.2d, v21.2d, v0.2d +; CHECK-NEXT: fadd v0.2d, v3.2d, v6.2d +; CHECK-NEXT: fadd v3.2d, v4.2d, v16.2d +; CHECK-NEXT: fadd v1.2d, v5.2d, v1.2d +; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d +; CHECK-NEXT: ret + %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z1 = uitofp <8 x i16> %s1 to <8 x double> + %s2 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z2 = uitofp <8 x i16> %s2 to <8 x double> + %s3 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z3 = uitofp <8 x i16> %s3 to <8 x double> + %s4 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z4 = uitofp <8 x i16> %s4 to <8 x double> + %a = fadd <8 x double> %z1, %z2 + %b = fadd <8 x double> %z3, %z4 + %c = fadd <8 x double> %a, %b + ret <8 x double> %c +} + +define <8 x double> @uitofp_load_fadd(ptr %p) { +; CHECK-LABEL: uitofp_load_fadd: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-NEXT: ushll v5.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: ushll v7.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v16.4s, v2.8h, #0 +; CHECK-NEXT: ushll v17.4s, v2.4h, #0 +; CHECK-NEXT: ushll2 v18.4s, v3.8h, #0 +; CHECK-NEXT: ushll v0.4s, v3.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v2.2d, v5.4s, #0 +; CHECK-NEXT: ushll v3.2d, v4.2s, #0 +; CHECK-NEXT: ushll v4.2d, v5.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v19.2d, v7.4s, #0 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll v7.2d, v7.2s, #0 +; CHECK-NEXT: ushll2 v20.2d, v16.4s, #0 +; CHECK-NEXT: ushll2 v21.2d, v17.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #0 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ushll v22.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v23.2d, v18.4s, #0 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: ushll v18.2d, v18.2s, #0 +; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ucvtf v3.2d, v3.2d +; CHECK-NEXT: ucvtf v4.2d, v4.2d +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ucvtf v19.2d, v19.2d +; CHECK-NEXT: ucvtf v6.2d, v6.2d +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: ucvtf v20.2d, v20.2d +; CHECK-NEXT: ucvtf v21.2d, v21.2d +; CHECK-NEXT: ucvtf v16.2d, v16.2d +; CHECK-NEXT: ucvtf v17.2d, v17.2d +; CHECK-NEXT: ucvtf v22.2d, v22.2d +; CHECK-NEXT: ucvtf v23.2d, v23.2d +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v18.2d, v18.2d +; CHECK-NEXT: fadd v1.2d, v1.2d, v5.2d +; CHECK-NEXT: fadd v4.2d, v4.2d, v7.2d +; CHECK-NEXT: fadd v6.2d, v3.2d, v6.2d +; CHECK-NEXT: fadd v2.2d, v2.2d, v19.2d +; CHECK-NEXT: fadd v3.2d, v17.2d, v22.2d +; CHECK-NEXT: fadd v5.2d, v16.2d, 
v18.2d +; CHECK-NEXT: fadd v7.2d, v21.2d, v0.2d +; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d +; CHECK-NEXT: fadd v0.2d, v4.2d, v3.2d +; CHECK-NEXT: fadd v3.2d, v1.2d, v16.2d +; CHECK-NEXT: fadd v1.2d, v2.2d, v7.2d +; CHECK-NEXT: fadd v2.2d, v6.2d, v5.2d +; CHECK-NEXT: ret + %l = load <32 x i16>, ptr %p + %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z1 = uitofp <8 x i16> %s1 to <8 x double> + %s2 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z2 = uitofp <8 x i16> %s2 to <8 x double> + %s3 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z3 = uitofp <8 x i16> %s3 to <8 x double> + %s4 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z4 = uitofp <8 x i16> %s4 to <8 x double> + %a = fadd <8 x double> %z1, %z2 + %b = fadd <8 x double> %z3, %z4 + %c = fadd <8 x double> %a, %b + ret <8 x double> %c +} + From 030e4d0cdf4c43a6ec1ca301b5a358991fa2ac4f Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 4 Sep 2024 10:38:18 +0200 Subject: [PATCH 049/425] [Clang] Treat default template argument as constant expressions (#107073) We only check that a default argument is a converted constant expression when using the default argument. However, when parsing a default argument, we need to make sure to parse it as a constant expression such as not ODR-use variables. (otherwise, we would try to capture default template arguments of generic lambdas) Fixes #107048 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Parse/ParseTemplate.cpp | 2 +- clang/test/SemaCXX/cxx2a-template-lambdas.cpp | 34 +++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 4128ca78ce396..251eb4c1c4559 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -346,6 +346,7 @@ Bug Fixes to C++ Support - Fix handling of ``_`` as the name of a lambda's init capture variable. (#GH107024) - Fix an issue with dependent source location expressions (#GH106428), (#GH81155), (#GH80210), (#GH85373) - Fixed a bug in the substitution of empty pack indexing types. 
(#GH105903) +- Clang no longer tries to capture non-odr used default arguments of template parameters of generic lambdas (#GH107048) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index 6ecfc15757f3d..de29652abbfd9 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -959,7 +959,7 @@ Parser::ParseNonTypeTemplateParameter(unsigned Depth, unsigned Position) { ++CurTemplateDepthTracker; EnterExpressionEvaluationContext ConstantEvaluated( Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated); - DefaultArg = Actions.CorrectDelayedTyposInExpr(ParseInitializer()); + DefaultArg = Actions.ActOnConstantExpression(ParseInitializer()); if (DefaultArg.isInvalid()) SkipUntil(tok::comma, tok::greater, StopAtSemi | StopBeforeMatch); } diff --git a/clang/test/SemaCXX/cxx2a-template-lambdas.cpp b/clang/test/SemaCXX/cxx2a-template-lambdas.cpp index fff524e77d3bf..00ba291fbd198 100644 --- a/clang/test/SemaCXX/cxx2a-template-lambdas.cpp +++ b/clang/test/SemaCXX/cxx2a-template-lambdas.cpp @@ -97,3 +97,37 @@ void foo() { } #endif + +#if __cplusplus >= 202002L +namespace { +struct S {}; +constexpr S gs; +void f() { + constexpr int x{}; + const int y{}; + auto b = []{}; + using A = decltype([]{}); + + int z; // expected-note {{'z' declared here}} + auto c = []{ + // expected-error@-1 {{no matching function for call to object of type}} \ + // expected-error@-1 {{variable 'z' cannot be implicitly captured in a lambda with no capture-default specified}} \ + // expected-note@-1 {{lambda expression begins here}} \ + // expected-note@-1 4{{capture}} \ + // expected-note@-1 {{candidate template ignored: substitution failure: reference to local variable 'z' declared in enclosing function}} + return t; + }(); + + auto class_type_global = []{}; + + static constexpr S static_s; + auto class_type_static = []{}; + + constexpr S s; // expected-note {{'s' declared here}} + auto class_type = []{}; + // expected-error@-1 {{variable 's' cannot be implicitly captured in a lambda with no capture-default specified}} \ + // expected-note@-1 {{lambda expression begins here}} \ + // expected-note@-1 4{{capture}} +} +} +#endif From 50febdeb64fce345b0fb669e9688d34c8ffe7912 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 4 Sep 2024 16:41:56 +0800 Subject: [PATCH 050/425] [mlir][vector] Bugfix of linearize `vector.extract` (#106836) This patch add check for `vector.extract` with scalar type, which is not allowed when linearize `vector.extract`. Fix #106162. 
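
As an illustration (the op below is borrowed from the regression test added in this patch, so it is only a sketch of the failing pattern): an extract whose result is a scalar rather than a vector, e.g.

    %cst = arith.constant dense<[1, 2, 3, 4]> : vector<4xi32>
    %0 = vector.extract %cst[0] : i32 from vector<4xi32>

The type converter yields no vector type for the scalar result `i32`, so the pattern now bails out with a match failure ("expected n-D vector type.") instead of attempting to linearize the op.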
--- .../Dialect/Vector/Transforms/VectorLinearize.cpp | 4 ++++ mlir/test/Dialect/Vector/linearize.mlir | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp index 868397f2daaae..11917ac1e4022 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp @@ -337,6 +337,10 @@ struct LinearizeVectorExtract final matchAndRewrite(vector::ExtractOp extractOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Type dstTy = getTypeConverter()->convertType(extractOp.getType()); + if (!dstTy) + return rewriter.notifyMatchFailure(extractOp, + "expected n-D vector type."); + if (extractOp.getVector().getType().isScalable() || cast(dstTy).isScalable()) return rewriter.notifyMatchFailure(extractOp, diff --git a/mlir/test/Dialect/Vector/linearize.mlir b/mlir/test/Dialect/Vector/linearize.mlir index 916e3e5fd2529..543e76b5b26e0 100644 --- a/mlir/test/Dialect/Vector/linearize.mlir +++ b/mlir/test/Dialect/Vector/linearize.mlir @@ -306,3 +306,15 @@ func.func @test_vector_insert_scalable(%arg0: vector<2x8x[4]xf32>, %arg1: vector // ALL: return %[[RES]] : vector<2x8x[4]xf32> return %0 : vector<2x8x[4]xf32> } + +// ----- + +// ALL-LABEL: test_vector_extract_scalar +func.func @test_vector_extract_scalar() { + %cst = arith.constant dense<[1, 2, 3, 4]> : vector<4xi32> + // ALL-NOT: vector.shuffle + // ALL: vector.extract + // ALL-NOT: vector.shuffle + %0 = vector.extract %cst[0] : i32 from vector<4xi32> + return +} From 58f289612f1959256fa2228f013cfe96304b45c4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 02:09:21 -0700 Subject: [PATCH 051/425] [XRay] Use DenseMap::{operator[],try_emplace} (NFC) (#107178) I'm planning to deprecate DenseMap::FindAndConstruct in favor of operator[]. I'm using try_emplace because "Vertices[I.first];" on its own might look like a nop statement. --- llvm/include/llvm/XRay/Graph.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/XRay/Graph.h b/llvm/include/llvm/XRay/Graph.h index 953ac1aa69637..07b418b8cb640 100644 --- a/llvm/include/llvm/XRay/Graph.h +++ b/llvm/include/llvm/XRay/Graph.h @@ -378,20 +378,17 @@ class Graph { /// Looks up the vertex with identifier I, if it does not exist it default /// constructs it. - VertexAttribute &operator[](const VertexIdentifier &I) { - return Vertices.FindAndConstruct(I).second; - } + VertexAttribute &operator[](const VertexIdentifier &I) { return Vertices[I]; } /// Looks up the edge with identifier I, if it does not exist it default /// constructs it, if it's endpoints do not exist it also default constructs /// them. EdgeAttribute &operator[](const EdgeIdentifier &I) { - auto &P = Edges.FindAndConstruct(I); - Vertices.FindAndConstruct(I.first); - Vertices.FindAndConstruct(I.second); + Vertices.try_emplace(I.first); + Vertices.try_emplace(I.second); InNeighbors[I.second].insert(I.first); OutNeighbors[I.first].insert(I.second); - return P.second; + return Edges[I]; } /// Looks up a vertex with Identifier I, or an error if it does not exist. 
@@ -479,8 +476,8 @@ class Graph { auto EI = Val.first; const auto &p = Edges.insert(std::move(Val)); if (p.second) { - Vertices.FindAndConstruct(EI.first); - Vertices.FindAndConstruct(EI.second); + Vertices.try_emplace(EI.first); + Vertices.try_emplace(EI.second); InNeighbors[EI.second].insert(EI.first); OutNeighbors[EI.first].insert(EI.second); }; From 9e08db796b7fc1aa21ec0a0c16a0213229e02010 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Wed, 4 Sep 2024 10:16:14 +0100 Subject: [PATCH 052/425] [OpenMPIRBuilder] Don't drop debug info for target region. (#80692) When an outlined function is generated for omp target region, a corresponding DISubprogram was not being generated. This resulted in all the debug information for the target region being dropped. This commit adds DISubprogram for the outlined function if there is one available for the parent function. It also updates the current debug location so that the right scope is used for the entries in the outlined function. There are places in the OpenMPIRBuilder which changes insertion point but don't update the debug location accordingly. They cause issue when debug info is enabled. I have fixed a few that I observed to cause issue. But there may be more and a systematic cleanup may be required. With this change in place, I can set source line breakpoint in target region and run to them in debugger. --- .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 6 ++- .../Integration/debug-target-region-vars.f90 | 28 ++++++++++++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 43 ++++++++++++++++--- .../Frontend/OpenMPIRBuilderTest.cpp | 4 ++ mlir/test/Target/LLVMIR/omptarget-debug.mlir | 29 +++++++++++++ mlir/test/Target/LLVMIR/omptarget-debug2.mlir | 31 +++++++++++++ 6 files changed, 134 insertions(+), 7 deletions(-) create mode 100644 flang/test/Integration/debug-target-region-vars.f90 create mode 100644 mlir/test/Target/LLVMIR/omptarget-debug.mlir create mode 100644 mlir/test/Target/LLVMIR/omptarget-debug2.mlir diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 46fc40b714aac..576e65ba6ecc5 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -309,7 +309,11 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, return; funcOp.walk([&](fir::cg::XDeclareOp declOp) { - handleDeclareOp(declOp, fileAttr, spAttr, typeGen, symbolTable); + // FIXME: We currently dont handle variables that are not in the entry + // blocks of the fuctions. These may be variable or arguments used in the + // OpenMP target regions. + if (&funcOp.front() == declOp->getBlock()) + handleDeclareOp(declOp, fileAttr, spAttr, typeGen, symbolTable); }); } diff --git a/flang/test/Integration/debug-target-region-vars.f90 b/flang/test/Integration/debug-target-region-vars.f90 new file mode 100644 index 0000000000000..a57afb301d9b7 --- /dev/null +++ b/flang/test/Integration/debug-target-region-vars.f90 @@ -0,0 +1,28 @@ +! RUN: %flang_fc1 -fopenmp -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +! Test that variables inside OpenMP target region don't cause build failure. 
+subroutine test1 + implicit none + real, allocatable :: xyz(:) + integer :: i + + !$omp target simd map(from:xyz) + do i = 1, size(xyz) + xyz(i) = 5.0 * xyz(i) + end do +end subroutine + +subroutine test2 (xyz) + integer :: i + integer :: xyz(:) + + !$omp target map(from:xyz) + !$omp do private(xyz) + do i = 1, 10 + xyz(i) = i + end do + !$omp end target +end subroutine + +!CHECK: DISubprogram(name: "test1"{{.*}}) +!CHECK: DISubprogram(name: "test2"{{.*}}) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 532313a31fc13..027b927fa6424 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -4328,6 +4329,7 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, // That's why make an unconditional branch from loop preheader to loop // exit block Builder.restoreIP({Preheader, Preheader->end()}); + Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc()); Preheader->getTerminator()->eraseFromParent(); Builder.CreateBr(CLI->getExit()); @@ -6584,13 +6586,45 @@ static Function *createOutlinedFunction( ParameterTypes.push_back(Arg->getType()); } + auto BB = Builder.GetInsertBlock(); + auto M = BB->getModule(); auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes, /*isVarArg*/ false); - auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, - Builder.GetInsertBlock()->getModule()); + auto Func = + Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M); // Save insert point. - auto OldInsertPoint = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); + // If there's a DISubprogram associated with current function, then + // generate one for the outlined function. + if (Function *ParentFunc = BB->getParent()) { + if (DISubprogram *SP = ParentFunc->getSubprogram()) { + DICompileUnit *CU = SP->getUnit(); + DIBuilder DB(*M, true, CU); + DebugLoc DL = Builder.getCurrentDebugLocation(); + if (DL) { + // TODO: We are using nullopt for arguments at the moment. This will + // need to be updated when debug data is being generated for variables. + DISubroutineType *Ty = + DB.createSubroutineType(DB.getOrCreateTypeArray(std::nullopt)); + DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition | + DISubprogram::SPFlagOptimized | + DISubprogram::SPFlagLocalToUnit; + + DISubprogram *OutlinedSP = DB.createFunction( + CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty, + DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags); + + // Attach subprogram to the function. + Func->setSubprogram(OutlinedSP); + // Update the CurrentDebugLocation in the builder so that right scope + // is used for things inside outlined function. + Builder.SetCurrentDebugLocation( + DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(), + OutlinedSP, DL.getInlinedAt())); + } + } + } // Generate the region into the function. BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func); @@ -6697,9 +6731,6 @@ static Function *createOutlinedFunction( for (auto Deferred : DeferredReplacement) ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func); - // Restore insert point. 
- Builder.restoreIP(OldInsertPoint); - return Func; } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 6207792f9f0d0..c92a3ff2e7ba6 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -6006,6 +6006,10 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { BasicBlock *FallbackBlock = Branch->getSuccessor(0); Iter = FallbackBlock->rbegin(); CallInst *FCall = dyn_cast(&*(++Iter)); + // 'F' has a dummy DISubprogram which causes OutlinedFunc to also + // have a DISubprogram. In this case, the call to OutlinedFunc needs + // to have a debug loc, otherwise verifier will complain. + FCall->setDebugLoc(DL); EXPECT_NE(FCall, nullptr); // Check that the correct aguments are passed in diff --git a/mlir/test/Target/LLVMIR/omptarget-debug.mlir b/mlir/test/Target/LLVMIR/omptarget-debug.mlir new file mode 100644 index 0000000000000..76a853249caca --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-debug.mlir @@ -0,0 +1,29 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {omp.is_target_device = true} { + llvm.func @_QQmain() { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + %9 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target map_entries(%9 -> %arg0 : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr): + %13 = llvm.mlir.constant(1 : i32) : i32 + llvm.store %13, %arg0 : i32, !llvm.ptr loc(#loc2) + omp.terminator + } + llvm.return + } loc(#loc3) +} +#file = #llvm.di_file<"target.f90" in ""> +#cu = #llvm.di_compile_unit, + sourceLanguage = DW_LANG_Fortran95, file = #file, isOptimized = false, + emissionKind = LineTablesOnly> +#sp_ty = #llvm.di_subroutine_type +#sp = #llvm.di_subprogram, compileUnit = #cu, scope = #file, + name = "_QQmain", file = #file, subprogramFlags = "Definition", type = #sp_ty> +#loc1 = loc("target.f90":1:1) +#loc2 = loc("target.f90":46:3) +#loc3 = loc(fused<#sp>[#loc1]) + +// CHECK-DAG: ![[SP:.*]] = {{.*}}!DISubprogram(name: "__omp_offloading_{{.*}}"{{.*}}) +// CHECK-DAG: !DILocation(line: 46, column: 3, scope: ![[SP]]) diff --git a/mlir/test/Target/LLVMIR/omptarget-debug2.mlir b/mlir/test/Target/LLVMIR/omptarget-debug2.mlir new file mode 100644 index 0000000000000..ee19cc31e5c6b --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-debug2.mlir @@ -0,0 +1,31 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Same test as omptarget-debug.mlir but with is_target_device = false. +// Somehow test with omp.target don't work with -split-input-file. 
+module attributes {omp.is_target_device = false} { + llvm.func @_QQmain() { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + %9 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target map_entries(%9 -> %arg0 : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr): + %13 = llvm.mlir.constant(1 : i32) : i32 + llvm.store %13, %arg0 : i32, !llvm.ptr loc(#loc2) + omp.terminator + } + llvm.return + } loc(#loc3) +} +#file = #llvm.di_file<"target.f90" in ""> +#cu = #llvm.di_compile_unit, + sourceLanguage = DW_LANG_Fortran95, file = #file, isOptimized = false, + emissionKind = LineTablesOnly> +#sp_ty = #llvm.di_subroutine_type +#sp = #llvm.di_subprogram, compileUnit = #cu, scope = #file, + name = "_QQmain", file = #file, subprogramFlags = "Definition", type = #sp_ty> +#loc1 = loc("target.f90":1:1) +#loc2 = loc("target.f90":46:3) +#loc3 = loc(fused<#sp>[#loc1]) + +// CHECK-DAG: ![[SP:.*]] = {{.*}}!DISubprogram(name: "__omp_offloading_{{.*}}"{{.*}}) +// CHECK-DAG: !DILocation(line: 46, column: 3, scope: ![[SP]]) From 5914566474de29309b0b4815ecd406805793de1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Wed, 4 Sep 2024 11:24:52 +0200 Subject: [PATCH 053/425] [Utils][SPIR-V] Adding spirv-sim to LLVM (#107094) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 2nd submission The buildbots are using python 3.8, and some type annotations I was using are only available starting 3.9. The last commit on the pile is the additional changes compared to the original submission https://github.com/llvm/llvm-project/pull/104020. ### Original text: Currently, the testing infrastructure for SPIR-V is based on FileCheck. Those tests are great to check some level of codegen, but when the test needs check both the CFG layout and the content of each basic-block, things becomes messy. Because the CHECK/CHECK-DAG/CHECK-NEXT state is limited, it is sometimes hard to catch the good block: if 2 basic blocks have similar instructions, FileCheck can match the wrong one. Cross-lane interaction can be a bit difficult to understand, and writting a FileCheck test that is strong enough to catch bad CFG transforms while not being broken everytime some unrelated codegen part changes is hard. And lastly, the spirv-val tooling we have checks that the generated SPIR-V respects the spec, not that it is correct in regards to the source IR. For those reasons, I believe the best way to test the structurizer is to: run spirv-val to make sure the CFG respects the spec. simulate the function to validate result for each lane, making sure the generated code is correct. This simulator has no other dependencies than core python. It also only supports a very limited set of instructions as we can test most features through control-flow and some basic cross-lane interactions. As-is, the added tests are just a harness for the simulator itself. If this gets merged, the structurizer PR will benefit from this as I'll be able to add extensive testing using this. 
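
To give a concrete picture of what these tests look like, here is the shape of one of the added tests, abridged from wave-get-lane-index.spv: the RUN line executes the `simple` function on a 4-lane wave and checks each lane's return value against the --expects list, and the function body simply returns the lane's SubgroupLocalInvocationId, so lane N is expected to produce N.

    ; RUN: spirv-sim --function=simple --wave=4 --expects=0,1,2,3 -i %s
    ...
    %2 = OpLoad %uint %WaveIndex
    %3 = OpBitcast %int %2
         OpReturnValue %3

The negative tests (simple-bad-result.spv, simulator-args.spv) still use FileCheck, but only to verify the simulator's own error reporting.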
--------- Signed-off-by: Nathan Gauër --- llvm/test/Other/spirv-sim/branch.spv | 42 ++ llvm/test/Other/spirv-sim/call.spv | 36 + llvm/test/Other/spirv-sim/constant.spv | 36 + llvm/test/Other/spirv-sim/lit.local.cfg | 8 + llvm/test/Other/spirv-sim/loop.spv | 58 ++ .../Other/spirv-sim/simple-bad-result.spv | 26 + llvm/test/Other/spirv-sim/simple.spv | 22 + llvm/test/Other/spirv-sim/simulator-args.spv | 36 + llvm/test/Other/spirv-sim/switch.spv | 42 ++ .../Other/spirv-sim/wave-get-lane-index.spv | 30 + .../Other/spirv-sim/wave-read-lane-first.spv | 83 +++ llvm/test/lit.cfg.py | 2 +- llvm/utils/spirv-sim/instructions.py | 381 ++++++++++ llvm/utils/spirv-sim/spirv-sim.py | 658 ++++++++++++++++++ 14 files changed, 1459 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Other/spirv-sim/branch.spv create mode 100644 llvm/test/Other/spirv-sim/call.spv create mode 100644 llvm/test/Other/spirv-sim/constant.spv create mode 100644 llvm/test/Other/spirv-sim/lit.local.cfg create mode 100644 llvm/test/Other/spirv-sim/loop.spv create mode 100644 llvm/test/Other/spirv-sim/simple-bad-result.spv create mode 100644 llvm/test/Other/spirv-sim/simple.spv create mode 100644 llvm/test/Other/spirv-sim/simulator-args.spv create mode 100644 llvm/test/Other/spirv-sim/switch.spv create mode 100644 llvm/test/Other/spirv-sim/wave-get-lane-index.spv create mode 100644 llvm/test/Other/spirv-sim/wave-read-lane-first.spv create mode 100644 llvm/utils/spirv-sim/instructions.py create mode 100755 llvm/utils/spirv-sim/spirv-sim.py diff --git a/llvm/test/Other/spirv-sim/branch.spv b/llvm/test/Other/spirv-sim/branch.spv new file mode 100644 index 0000000000000..7ee0ebcad249d --- /dev/null +++ b/llvm/test/Other/spirv-sim/branch.spv @@ -0,0 +1,42 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=3 --expects=5,6,6 -i %s + OpCapability Shader + OpCapability GroupNonUniform + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %bool = OpTypeBool + %int_2 = OpConstant %int 2 + %int_5 = OpConstant %int 5 + %int_6 = OpConstant %int 6 + %uint_0 = OpConstant %uint 0 + %void = OpTypeVoid + %main_type = OpTypeFunction %void +%simple_type = OpTypeFunction %int + %uint_iptr = OpTypePointer Input %uint + %WaveIndex = OpVariable %uint_iptr Input + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + %2 = OpLoad %uint %WaveIndex + %3 = OpIEqual %bool %uint_0 %2 + OpSelectionMerge %merge None + OpBranchConditional %3 %true %false + %true = OpLabel + OpBranch %merge + %false = OpLabel + OpBranch %merge + %merge = OpLabel + %4 = OpPhi %int %int_5 %true %int_6 %false + OpReturnValue %4 + OpFunctionEnd + diff --git a/llvm/test/Other/spirv-sim/call.spv b/llvm/test/Other/spirv-sim/call.spv new file mode 100644 index 0000000000000..320b048f95296 --- /dev/null +++ b/llvm/test/Other/spirv-sim/call.spv @@ -0,0 +1,36 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=1 --expects=2 -i %s + OpCapability Shader + OpCapability GroupNonUniform + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + 
OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %uint_2 = OpConstant %uint 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void +%simple_type = OpTypeFunction %int + %sub_type = OpTypeFunction %uint + %uint_iptr = OpTypePointer Input %uint + %WaveIndex = OpVariable %uint_iptr Input + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %sub = OpFunction %uint None %sub_type + %a = OpLabel + OpReturnValue %uint_2 + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + %2 = OpFunctionCall %uint %sub + %3 = OpBitcast %int %2 + OpReturnValue %3 + OpFunctionEnd + + diff --git a/llvm/test/Other/spirv-sim/constant.spv b/llvm/test/Other/spirv-sim/constant.spv new file mode 100644 index 0000000000000..1002427943a8d --- /dev/null +++ b/llvm/test/Other/spirv-sim/constant.spv @@ -0,0 +1,36 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=a --wave=1 --expects=2 -i %s +; RUN: spirv-sim --function=b --wave=1 --expects=1 -i %s + OpCapability Shader + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %a "a" + OpName %b "b" + OpName %main "main" + %int = OpTypeInt 32 1 + %s1 = OpTypeStruct %int %int %int + %s2 = OpTypeStruct %s1 + %int_1 = OpConstant %int 1 + %int_2 = OpConstant %int 2 + %s1_1_2 = OpConstantComposite %s1 %int_1 %int_2 %int_1 + %s2_s1 = OpConstantComposite %s2 %s1_1_2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void + %simple_type = OpTypeFunction %int + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %a = OpFunction %int None %simple_type + %1 = OpLabel + %2 = OpCompositeExtract %int %s1_1_2 1 + OpReturnValue %2 + OpFunctionEnd + %b = OpFunction %int None %simple_type + %3 = OpLabel + %4 = OpCompositeExtract %int %s2_s1 0 2 + OpReturnValue %4 + OpFunctionEnd + diff --git a/llvm/test/Other/spirv-sim/lit.local.cfg b/llvm/test/Other/spirv-sim/lit.local.cfg new file mode 100644 index 0000000000000..67a8d9196f588 --- /dev/null +++ b/llvm/test/Other/spirv-sim/lit.local.cfg @@ -0,0 +1,8 @@ +spirv_sim_root = os.path.join(config.llvm_src_root, "utils", "spirv-sim") +config.substitutions.append( + ( + "spirv-sim", + "'%s' %s" + % (config.python_executable, os.path.join(spirv_sim_root, "spirv-sim.py")), + ) +) diff --git a/llvm/test/Other/spirv-sim/loop.spv b/llvm/test/Other/spirv-sim/loop.spv new file mode 100644 index 0000000000000..4fd0f1a7c96a3 --- /dev/null +++ b/llvm/test/Other/spirv-sim/loop.spv @@ -0,0 +1,58 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=4 --expects=0,2,2,4 -i %s + OpCapability Shader + OpCapability GroupNonUniform + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %bool = OpTypeBool + %int_2 = OpConstant %int 2 + %int_5 = OpConstant %int 5 + %int_6 = OpConstant %int 6 + %uint_0 = OpConstant %uint 0 + %uint_2 = OpConstant %uint 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void +%simple_type = OpTypeFunction %int + %uint_iptr = OpTypePointer Input %uint + %uint_fptr = OpTypePointer Function %uint + %WaveIndex = OpVariable %uint_iptr Input + %main = 
OpFunction %void None %main_type + %unused = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %entry = OpLabel +; uint i = 0; + %i = OpVariable %uint_fptr Function + %1 = OpLoad %uint %WaveIndex + OpStore %i %uint_0 + OpBranch %header + %header = OpLabel + %2 = OpLoad %uint %i + %3 = OpULessThan %bool %2 %1 + OpLoopMerge %merge %continue None + OpBranchConditional %3 %body %merge +; while (i < WaveGetLaneIndex()) { +; i += 2; +; } + %body = OpLabel + OpBranch %continue + %continue = OpLabel + %4 = OpIAdd %uint %2 %uint_2 + OpStore %i %4 + OpBranch %header + %merge = OpLabel +; return (int) i; + %5 = OpLoad %uint %i + %6 = OpBitcast %int %5 + OpReturnValue %6 + OpFunctionEnd + + diff --git a/llvm/test/Other/spirv-sim/simple-bad-result.spv b/llvm/test/Other/spirv-sim/simple-bad-result.spv new file mode 100644 index 0000000000000..f4dd046cc078b --- /dev/null +++ b/llvm/test/Other/spirv-sim/simple-bad-result.spv @@ -0,0 +1,26 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: not spirv-sim --function=simple --wave=1 --expects=1 -i %s 2>&1 | FileCheck %s + +; CHECK: Expected != Observed +; CHECK: [1] != [2] + OpCapability Shader + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + %int = OpTypeInt 32 1 + %int_2 = OpConstant %int 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void + %simple_type = OpTypeFunction %int + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + OpReturnValue %int_2 + OpFunctionEnd + diff --git a/llvm/test/Other/spirv-sim/simple.spv b/llvm/test/Other/spirv-sim/simple.spv new file mode 100644 index 0000000000000..8c06192ea6e3d --- /dev/null +++ b/llvm/test/Other/spirv-sim/simple.spv @@ -0,0 +1,22 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=1 --expects=2 -i %s + OpCapability Shader + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + %int = OpTypeInt 32 1 + %int_2 = OpConstant %int 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void + %simple_type = OpTypeFunction %int + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + OpReturnValue %int_2 + OpFunctionEnd diff --git a/llvm/test/Other/spirv-sim/simulator-args.spv b/llvm/test/Other/spirv-sim/simulator-args.spv new file mode 100644 index 0000000000000..d8b1018064158 --- /dev/null +++ b/llvm/test/Other/spirv-sim/simulator-args.spv @@ -0,0 +1,36 @@ +; RUN: not spirv-sim --function=simple --wave=a --expects=2 -i %s 2>&1 | FileCheck %s --check-prefixes=CHECK-WAVE +; RUN: not spirv-sim --function=simple --wave=1 --expects=a -i %s 2>&1 | FileCheck %s --check-prefixes=CHECK-EXPECT +; RUN: not spirv-sim --function=simple --wave=1 --expects=1, -i %s 2>&1 | FileCheck %s --check-prefixes=CHECK-EXPECT +; RUN: not spirv-sim --function=simple --wave=2 --expects=1 -i %s 2>&1 | FileCheck %s --check-prefixes=CHECK-SIZE +; RUN: not spirv-sim --function=foo --wave=1 --expects=1 -i %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NAME + +; CHECK-WAVE: Invalid format for --wave/-w flag. + +; CHECK-EXPECT: Invalid format for --expects/-e flag. 
+ +; CHECK-SIZE: Wave size != expected result array size + +; CHECK-NAME: 'foo' function not found. Known functions are: +; CHECK-NAME-NEXT: - main +; CHECK-NAME-NEXT: - simple +; CHECK-NANE-NOT-NEXT: - + OpCapability Shader + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + %int = OpTypeInt 32 1 + %int_2 = OpConstant %int 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void + %simple_type = OpTypeFunction %int + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + OpReturnValue %int_2 + OpFunctionEnd diff --git a/llvm/test/Other/spirv-sim/switch.spv b/llvm/test/Other/spirv-sim/switch.spv new file mode 100644 index 0000000000000..83dc56cecef2a --- /dev/null +++ b/llvm/test/Other/spirv-sim/switch.spv @@ -0,0 +1,42 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=4 --expects=0,1,2,0 -i %s + OpCapability Shader + OpCapability GroupNonUniform + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %bool = OpTypeBool + %int_0 = OpConstant %int 0 + %int_1 = OpConstant %int 1 + %int_2 = OpConstant %int 2 + %uint_0 = OpConstant %uint 0 + %void = OpTypeVoid + %main_type = OpTypeFunction %void +%simple_type = OpTypeFunction %int + %uint_iptr = OpTypePointer Input %uint + %WaveIndex = OpVariable %uint_iptr Input + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + %2 = OpLoad %uint %WaveIndex + OpSelectionMerge %merge None + OpSwitch %2 %default 1 %case_1 2 %case_2 + %default = OpLabel + OpBranch %merge + %case_1 = OpLabel + OpBranch %merge + %case_2 = OpLabel + OpBranch %merge + %merge = OpLabel + %4 = OpPhi %int %int_0 %default %int_1 %case_1 %int_2 %case_2 + OpReturnValue %4 + OpFunctionEnd diff --git a/llvm/test/Other/spirv-sim/wave-get-lane-index.spv b/llvm/test/Other/spirv-sim/wave-get-lane-index.spv new file mode 100644 index 0000000000000..1c1e5e8aefd4f --- /dev/null +++ b/llvm/test/Other/spirv-sim/wave-get-lane-index.spv @@ -0,0 +1,30 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=4 --expects=0,1,2,3 -i %s + OpCapability Shader + OpCapability GroupNonUniform + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %int_2 = OpConstant %int 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void +%simple_type = OpTypeFunction %int + %uint_iptr = OpTypePointer Input %uint + %WaveIndex = OpVariable %uint_iptr Input + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + %2 = OpLoad %uint %WaveIndex + %3 = OpBitcast %int %2 + OpReturnValue %3 + OpFunctionEnd + diff --git a/llvm/test/Other/spirv-sim/wave-read-lane-first.spv b/llvm/test/Other/spirv-sim/wave-read-lane-first.spv new file 
mode 100644 index 0000000000000..801fb55fbaa9f --- /dev/null +++ b/llvm/test/Other/spirv-sim/wave-read-lane-first.spv @@ -0,0 +1,83 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=4 --expects=0,1,2,0 -i %s + +; int simple() { +; int m[4] = { 0, 1, 2, 0 }; +; int idx = WaveGetLaneIndex(); +; for (int i = 0; i < 4; i++) { +; if (i == m[idx]) { +; return WaveReadLaneFirst(idx); +; } +; } +; return 0; +; } + OpCapability Shader + OpCapability GroupNonUniform + OpCapability GroupNonUniformBallot + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %bool = OpTypeBool + %int_0 = OpConstant %int 0 + %int_1 = OpConstant %int 1 + %int_2 = OpConstant %int 2 + %int_4 = OpConstant %int 4 + %uint_3 = OpConstant %uint 3 + %uint_4 = OpConstant %uint 4 + %void = OpTypeVoid + %main_type = OpTypeFunction %void + %simple_type = OpTypeFunction %int + %uint_iptr = OpTypePointer Input %uint + %int_fptr = OpTypePointer Function %int + %arr_int_uint_4 = OpTypeArray %int %uint_4 +%arr_int_uint_4_fptr = OpTypePointer Function %arr_int_uint_4 + %WaveIndex = OpVariable %uint_iptr Input + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %bb_entry_0 = OpLabel + %m = OpVariable %arr_int_uint_4_fptr Function + %idx = OpVariable %int_fptr Function + %i = OpVariable %int_fptr Function + %27 = OpCompositeConstruct %arr_int_uint_4 %int_0 %int_1 %int_2 %int_0 + OpStore %m %27 + %28 = OpLoad %uint %WaveIndex + %29 = OpBitcast %int %28 + OpStore %idx %29 + OpStore %i %int_0 + OpBranch %for_check + %for_check = OpLabel + %31 = OpLoad %int %i + %33 = OpSLessThan %bool %31 %int_4 + OpLoopMerge %for_merge %for_continue None + OpBranchConditional %33 %for_body %for_merge + %for_body = OpLabel + %37 = OpLoad %int %i + %38 = OpLoad %int %idx + %39 = OpAccessChain %int_fptr %m %38 + %40 = OpLoad %int %39 + %41 = OpIEqual %bool %37 %40 + OpSelectionMerge %if_merge None + OpBranchConditional %41 %if_true %if_merge + %if_true = OpLabel + %44 = OpLoad %int %idx + %45 = OpGroupNonUniformBroadcastFirst %int %uint_3 %44 + OpReturnValue %45 + %if_merge = OpLabel + OpBranch %for_continue + %for_continue = OpLabel + %47 = OpLoad %int %i + %48 = OpIAdd %int %47 %int_1 + OpStore %i %48 + OpBranch %for_check + %for_merge = OpLabel + OpReturnValue %int_0 + OpFunctionEnd diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index bee7aa3903a1f..1e0dd0a7df34f 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -22,7 +22,7 @@ # suffixes: A list of file extensions to treat as test files. This is overriden # by individual lit.local.cfg files in the test subdirectories. -config.suffixes = [".ll", ".c", ".test", ".txt", ".s", ".mir", ".yaml"] +config.suffixes = [".ll", ".c", ".test", ".txt", ".s", ".mir", ".yaml", ".spv"] # excludes: A list of directories to exclude from the testsuite. The 'Inputs' # subdirectories contain auxiliary inputs for various tests in their parent diff --git a/llvm/utils/spirv-sim/instructions.py b/llvm/utils/spirv-sim/instructions.py new file mode 100644 index 0000000000000..5e64a480a2be6 --- /dev/null +++ b/llvm/utils/spirv-sim/instructions.py @@ -0,0 +1,381 @@ +from typing import Optional, List + + +# Base class for an instruction. 
To implement a basic instruction that doesn't +# impact the control-flow, create a new class inheriting from this. +class Instruction: + # Contains the name of the output register, if any. + _result: Optional[str] + # Contains the instruction opcode. + _opcode: str + # Contains all the instruction operands, except result and opcode. + _operands: List[str] + + def __init__(self, line: str): + self.line = line + tokens = line.split() + if len(tokens) > 1 and tokens[1] == "=": + self._result = tokens[0] + self._opcode = tokens[2] + self._operands = tokens[3:] if len(tokens) > 2 else [] + else: + self._result = None + self._opcode = tokens[0] + self._operands = tokens[1:] if len(tokens) > 1 else [] + + def __str__(self): + if self._result is None: + return f" {self._opcode} {self._operands}" + return f"{self._result:3} = {self._opcode} {self._operands}" + + # Returns the instruction opcode. + def opcode(self) -> str: + return self._opcode + + # Returns the instruction operands. + def operands(self) -> List[str]: + return self._operands + + # Returns the instruction output register. Calling this function is + # only allowed if has_output_register() is true. + def output_register(self) -> str: + assert self._result is not None + return self._result + + # Returns true if this function has an output register. False otherwise. + def has_output_register(self) -> bool: + return self._result is not None + + # This function is used to initialize state related to this instruction + # before module execution begins. For example, global Input variables + # can use this to store the lane ID into the register. + def static_execution(self, lane): + pass + + # This function is called everytime this instruction is executed by a + # tangle. This function should not be directly overriden, instead see + # _impl and _advance_ip. + def runtime_execution(self, module, lane): + self._impl(module, lane) + self._advance_ip(module, lane) + + # This function needs to be overriden if your instruction can be executed. + # It implements the logic of the instruction. + # 'Static' instructions like OpConstant should not override this since + # they are not supposed to be executed at runtime. + def _impl(self, module, lane): + raise RuntimeError(f"Unimplemented instruction {self}") + + # By default, IP is incremented to point to the next instruction. + # If the instruction modifies IP (like OpBranch), this must be overridden. + def _advance_ip(self, module, lane): + lane.set_ip(lane.ip() + 1) + + +# Those are parsed, but never executed. +class OpEntryPoint(Instruction): + pass + + +class OpFunction(Instruction): + pass + + +class OpFunctionEnd(Instruction): + pass + + +class OpLabel(Instruction): + pass + + +class OpVariable(Instruction): + pass + + +class OpName(Instruction): + def name(self) -> str: + return self._operands[1][1:-1] + + def decoratedRegister(self) -> str: + return self._operands[0] + + +# The only decoration we use if the BuiltIn one to initialize the values. 
+class OpDecorate(Instruction): + def static_execution(self, lane): + if self._operands[1] == "LinkageAttributes": + return + + assert ( + self._operands[1] == "BuiltIn" + and self._operands[2] == "SubgroupLocalInvocationId" + ) + lane.set_register(self._operands[0], lane.tid()) + + +# Constants +class OpConstant(Instruction): + def static_execution(self, lane): + lane.set_register(self._result, int(self._operands[1])) + + +class OpConstantTrue(OpConstant): + def static_execution(self, lane): + lane.set_register(self._result, True) + + +class OpConstantFalse(OpConstant): + def static_execution(self, lane): + lane.set_register(self._result, False) + + +class OpConstantComposite(OpConstant): + def static_execution(self, lane): + result = [] + for op in self._operands[1:]: + result.append(lane.get_register(op)) + lane.set_register(self._result, result) + + +# Control flow instructions +class OpFunctionCall(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + entry = module.get_function_entry(self._operands[1]) + lane.do_call(entry, self._result) + + +class OpReturn(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + lane.do_return(None) + + +class OpReturnValue(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + lane.do_return(lane.get_register(self._operands[0])) + + +class OpBranch(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + lane.set_ip(module.get_bb_entry(self._operands[0])) + pass + + +class OpBranchConditional(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + condition = lane.get_register(self._operands[0]) + if condition: + lane.set_ip(module.get_bb_entry(self._operands[1])) + else: + lane.set_ip(module.get_bb_entry(self._operands[2])) + + +class OpSwitch(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + value = lane.get_register(self._operands[0]) + default_label = self._operands[1] + i = 2 + while i < len(self._operands): + imm = int(self._operands[i]) + label = self._operands[i + 1] + if value == imm: + lane.set_ip(module.get_bb_entry(label)) + return + i += 2 + lane.set_ip(module.get_bb_entry(default_label)) + + +class OpUnreachable(Instruction): + def _impl(self, module, lane): + raise RuntimeError("This instruction should never be executed.") + + +# Convergence instructions +class MergeInstruction(Instruction): + def merge_location(self): + return self._operands[0] + + def continue_location(self): + return None if len(self._operands) < 3 else self._operands[1] + + def _impl(self, module, lane): + lane.handle_convergence_header(self) + + +class OpLoopMerge(MergeInstruction): + pass + + +class OpSelectionMerge(MergeInstruction): + pass + + +# Other instructions +class OpBitcast(Instruction): + def _impl(self, module, lane): + # TODO: find out the type from the defining instruction. + # This can only work for DXC. + if self._operands[0] == "%int": + lane.set_register(self._result, int(lane.get_register(self._operands[1]))) + else: + raise RuntimeError("Unsupported OpBitcast operand") + + +class OpAccessChain(Instruction): + def _impl(self, module, lane): + # Python dynamic types allows me to simplify. As long as the SPIR-V + # is legal, this should be fine. 
+ # Note: SPIR-V structs are stored as tuples + value = lane.get_register(self._operands[1]) + for operand in self._operands[2:]: + value = value[lane.get_register(operand)] + lane.set_register(self._result, value) + + +class OpCompositeConstruct(Instruction): + def _impl(self, module, lane): + output = [] + for op in self._operands[1:]: + output.append(lane.get_register(op)) + lane.set_register(self._result, output) + + +class OpCompositeExtract(Instruction): + def _impl(self, module, lane): + value = lane.get_register(self._operands[1]) + output = value + for op in self._operands[2:]: + output = output[int(op)] + lane.set_register(self._result, output) + + +class OpStore(Instruction): + def _impl(self, module, lane): + lane.set_register(self._operands[0], lane.get_register(self._operands[1])) + + +class OpLoad(Instruction): + def _impl(self, module, lane): + lane.set_register(self._result, lane.get_register(self._operands[1])) + + +class OpIAdd(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS + RHS) + + +class OpISub(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS - RHS) + + +class OpIMul(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS * RHS) + + +class OpLogicalNot(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + lane.set_register(self._result, not LHS) + + +class _LessThan(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS < RHS) + + +class _GreaterThan(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS > RHS) + + +class OpSLessThan(_LessThan): + pass + + +class OpULessThan(_LessThan): + pass + + +class OpSGreaterThan(_GreaterThan): + pass + + +class OpUGreaterThan(_GreaterThan): + pass + + +class OpIEqual(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS == RHS) + + +class OpINotEqual(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS != RHS) + + +class OpPhi(Instruction): + def _impl(self, module, lane): + previousBBName = lane.get_previous_bb_name() + i = 1 + while i < len(self._operands): + label = self._operands[i + 1] + if label == previousBBName: + lane.set_register(self._result, lane.get_register(self._operands[i])) + return + i += 2 + raise RuntimeError("previousBB not in the OpPhi _operands") + + +class OpSelect(Instruction): + def _impl(self, module, lane): + condition = lane.get_register(self._operands[1]) + value = lane.get_register(self._operands[2 if condition else 3]) + lane.set_register(self._result, value) + + +# Wave intrinsics +class OpGroupNonUniformBroadcastFirst(Instruction): + def _impl(self, module, lane): + assert lane.get_register(self._operands[1]) == 3 + if lane.is_first_active_lane(): + lane.broadcast_register(self._result, 
lane.get_register(self._operands[2])) + + +class OpGroupNonUniformElect(Instruction): + def _impl(self, module, lane): + lane.set_register(self._result, lane.is_first_active_lane()) diff --git a/llvm/utils/spirv-sim/spirv-sim.py b/llvm/utils/spirv-sim/spirv-sim.py new file mode 100755 index 0000000000000..428b0ca4eb796 --- /dev/null +++ b/llvm/utils/spirv-sim/spirv-sim.py @@ -0,0 +1,658 @@ +#!/usr/bin/env python3 + +from __future__ import annotations +from dataclasses import dataclass +from instructions import * +from typing import Any, Iterable, Callable, Optional, Tuple, List, Dict +import argparse +import fileinput +import inspect +import re +import sys + +RE_EXPECTS = re.compile(r"^([0-9]+,)*[0-9]+$") + + +# Parse the SPIR-V instructions. Some instructions are ignored because +# not required to simulate this module. +# Instructions are to be implemented in instructions.py +def parseInstruction(i): + IGNORED = set( + [ + "OpCapability", + "OpMemoryModel", + "OpExecutionMode", + "OpExtension", + "OpSource", + "OpTypeInt", + "OpTypeStruct", + "OpTypeFloat", + "OpTypeBool", + "OpTypeVoid", + "OpTypeFunction", + "OpTypePointer", + "OpTypeArray", + ] + ) + if i.opcode() in IGNORED: + return None + + try: + Type = getattr(sys.modules["instructions"], i.opcode()) + except AttributeError: + raise RuntimeError(f"Unsupported instruction {i}") + if not inspect.isclass(Type): + raise RuntimeError( + f"{i} instruction definition is not a class. Did you used 'def' instead of 'class'?" + ) + return Type(i.line) + + +# Split a list of instructions into pieces. Pieces are delimited by instructions of the type splitType. +# The delimiter is the first instruction of the next piece. +# This function returns no empty pieces: +# - if 2 subsequent delimiters will mean 2 pieces. One with only the first delimiter, and the second +# with the delimiter and following instructions. +# - if the first instruction is a delimiter, the first piece will begin with this delimiter. +def splitInstructions( + splitType: type, instructions: Iterable[Instruction] +) -> List[List[Instruction]]: + blocks: List[List[Instruction]] = [[]] + for instruction in instructions: + if isinstance(instruction, splitType) and len(blocks[-1]) > 0: + blocks.append([]) + blocks[-1].append(instruction) + return blocks + + +# Defines a BasicBlock in the simulator. +# Begins at an OpLabel, and ends with a control-flow instruction. +class BasicBlock: + def __init__(self, instructions) -> None: + assert isinstance(instructions[0], OpLabel) + # The name of the basic block, which is the register of the leading + # OpLabel. + self._name = instructions[0].output_register() + # The list of instructions belonging to this block. + self._instructions = instructions[1:] + + # Returns the name of this basic block. + def name(self): + return self._name + + # Returns the instruction at index in this basic block. + def __getitem__(self, index: int) -> Instruction: + return self._instructions[index] + + # Returns the number of instructions in this basic block, excluding the + # leading OpLabel. + def __len__(self): + return len(self._instructions) + + def dump(self): + print(f" {self._name}:") + for instruction in self._instructions: + print(f" {instruction}") + + +# Defines a Function in the simulator. +class Function: + def __init__(self, instructions) -> None: + assert isinstance(instructions[0], OpFunction) + # The name of the function (name of the register returned by OpFunction). 
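+        # (The pretty name attached by OpName, if any, is resolved separately
+        # through Module.getNameFromRegister.)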
+ self._name: str = instructions[0].output_register() + # The list of basic blocks that belongs to this function. + self._basic_blocks: List[BasicBlock] = [] + # The variables local to this function. + self._variables: List[OpVariable] = [ + x for x in instructions if isinstance(x, OpVariable) + ] + + assert isinstance(instructions[-1], OpFunctionEnd) + body = filter(lambda x: not isinstance(x, OpVariable), instructions[1:-1]) + for block in splitInstructions(OpLabel, body): + self._basic_blocks.append(BasicBlock(block)) + + # Returns the name of this function. + def name(self) -> str: + return self._name + + # Returns the basic block at index in this function. + def __getitem__(self, index: int) -> BasicBlock: + return self._basic_blocks[index] + + # Returns the index of the basic block with the given name if found, + # -1 otherwise. + def get_bb_index(self, name) -> int: + for i in range(len(self._basic_blocks)): + if self._basic_blocks[i].name() == name: + return i + return -1 + + def dump(self): + print(" Variables:") + for var in self._variables: + print(f" {var}") + print(" Blocks:") + for bb in self._basic_blocks: + bb.dump() + + +# Represents an instruction pointer in the simulator. +@dataclass +class InstructionPointer: + # The current function the IP points to. + function: Function + # The basic block index in function IP points to. + basic_block: int + # The instruction in basic_block IP points to. + instruction_index: int + + def __str__(self): + bb = self.function[self.basic_block] + i = bb[self.instruction_index] + return f"{bb.name()}:{self.instruction_index} in {self.function.name()} | {i}" + + def __hash__(self): + return hash((self.function.name(), self.basic_block, self.instruction_index)) + + # Returns the basic block IP points to. + def bb(self) -> BasicBlock: + return self.function[self.basic_block] + + # Returns the instruction IP points to. + def instruction(self): + return self.function[self.basic_block][self.instruction_index] + + # Increment IP by 1. This only works inside a basic-block boundary. + # Incrementing IP when at the boundary of a basic block will fail. + def __add__(self, value: int): + bb = self.function[self.basic_block] + assert len(bb) > self.instruction_index + value + return InstructionPointer( + self.function, self.basic_block, self.instruction_index + value + ) + + +# Defines a Lane in this simulator. +class Lane: + # The registers known by this lane. + _registers: Dict[str, Any] + # The current IP of this lane. + _ip: Optional[InstructionPointer] + # If this lane running. + _running: bool + # The wave this lane belongs to. + _wave: Wave + # The callstack of this lane. Each tuple represents 1 call. + # The first element is the IP the function will return to. + # The second element is the callback to call to store the return value + # into the correct register. + _callstack: List[Tuple[InstructionPointer, Callable[[Any], None]]] + + _previous_bb: Optional[BasicBlock] + _current_bb: Optional[BasicBlock] + + def __init__(self, wave: Wave, tid: int) -> None: + self._registers = dict() + self._ip = None + self._running = True + self._wave = wave + self._callstack = [] + + # The index of this lane in the wave. + self._tid = tid + # The last BB this lane was executing into. + self._previous_bb = None + # The current BB this lane is executing into. + self._current_bb = None + + # Returns the lane/thread ID of this lane in its wave. 
+ def tid(self) -> int: + return self._tid + + # Returns true is this lane if the first by index in the current active tangle. + def is_first_active_lane(self) -> bool: + return self._tid == self._wave.get_first_active_lane_index() + + # Broadcast value into the registers of all active lanes. + def broadcast_register(self, register: str, value: Any) -> None: + self._wave.broadcast_register(register, value) + + # Returns the IP this lane is currently at. + def ip(self) -> InstructionPointer: + assert self._ip is not None + return self._ip + + # Returns true if this lane is running, false otherwise. + # Running means not dead. An inactive lane is running. + def running(self) -> bool: + return self._running + + # Set the register at "name" to "value" in this lane. + def set_register(self, name: str, value: Any) -> None: + self._registers[name] = value + + # Get the value in register "name" in this lane. + # If allow_undef is true, fetching an unknown register won't fail. + def get_register(self, name: str, allow_undef: bool = False) -> Optional[Any]: + if allow_undef and name not in self._registers: + return None + return self._registers[name] + + def set_ip(self, ip: InstructionPointer) -> None: + if ip.bb() != self._current_bb: + self._previous_bb = self._current_bb + self._current_bb = ip.bb() + self._ip = ip + + def get_previous_bb_name(self): + return self._previous_bb.name() + + def handle_convergence_header(self, instruction): + self._wave.handle_convergence_header(self, instruction) + + def do_call(self, ip, output_register): + return_ip = None if self._ip is None else self._ip + 1 + self._callstack.append( + (return_ip, lambda value: self.set_register(output_register, value)) + ) + self.set_ip(ip) + + def do_return(self, value): + ip, callback = self._callstack[-1] + self._callstack.pop() + + callback(value) + if len(self._callstack) == 0: + self._running = False + else: + self.set_ip(ip) + + +# Represents the SPIR-V module in the simulator. +class Module: + _functions: Dict[str, Function] + _prolog: List[Instruction] + _globals: List[Instruction] + _name2reg: Dict[str, str] + _reg2name: Dict[str, str] + + def __init__(self, instructions) -> None: + chunks = splitInstructions(OpFunction, instructions) + + # The instructions located outside of all functions. + self._prolog = chunks[0] + # The functions in this module. + self._functions = {} + # Global variables in this module. + self._globals = [ + x + for x in instructions + if isinstance(x, OpVariable) or issubclass(type(x), OpConstant) + ] + + # Helper dictionaries to get real names of registers, or registers by names. + self._name2reg = {} + self._reg2name = {} + for instruction in instructions: + if isinstance(instruction, OpName): + name = instruction.name() + reg = instruction.decoratedRegister() + self._name2reg[name] = reg + self._reg2name[reg] = name + + for chunk in chunks[1:]: + function = Function(chunk) + assert function.name() not in self._functions + self._functions[function.name()] = function + + # Returns the register matching "name" if any, None otherwise. + # This assumes names are unique. + def getRegisterFromName(self, name): + if name in self._name2reg: + return self._name2reg[name] + return None + + # Returns the name given to "register" if any, None otherwise. + def getNameFromRegister(self, register): + if register in self._reg2name: + return self._reg2name[register] + return None + + # Initialize the module before wave execution begins. + # See Instruction::static_execution for more details. 
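+    # This executes the static part of global constants/variables and of the
+    # OpDecorate builtins (e.g. SubgroupLocalInvocationId) for the given lane.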
+ def initialize(self, lane): + for instruction in self._globals: + instruction.static_execution(lane) + + # Initialize builtins + for instruction in self._prolog: + if isinstance(instruction, OpDecorate): + instruction.static_execution(lane) + + def execute_one_instruction(self, lane: Lane, ip: InstructionPointer) -> None: + ip.instruction().runtime_execution(self, lane) + + # Returns the first valid IP for the function defined by the given register. + # Calling this with a register not returned by OpFunction is illegal. + def get_function_entry(self, register: str) -> InstructionPointer: + if register not in self._functions: + raise RuntimeError(f"Function defining {register} not found.") + return InstructionPointer(self._functions[register], 0, 0) + + # Returns the first valid IP for the basic block defined by register. + # Calling this with a register not returned by an OpLabel is illegal. + def get_bb_entry(self, register: str) -> InstructionPointer: + for name, function in self._functions.items(): + index = function.get_bb_index(register) + if index != -1: + return InstructionPointer(function, index, 0) + raise RuntimeError(f"Instruction defining {register} not found.") + + # Returns the list of function names in this module. + # If an OpName exists for this function, returns the pretty name, else + # returns the register name. + def get_function_names(self): + return [self.getNameFromRegister(reg) for reg, func in self._functions.items()] + + # Returns the global variables defined in this module. + def variables(self) -> Iterable: + return [x.output_register() for x in self._globals] + + def dump(self, function_name: Optional[str] = None): + print("Module:") + print(" globals:") + for instruction in self._globals: + print(f" {instruction}") + + if function_name is None: + print(" functions:") + for register, function in self._functions.items(): + name = self.getNameFromRegister(register) + print(f" Function {register} ({name})") + function.dump() + return + + register = self.getRegisterFromName(function_name) + print(f" function {register} ({function_name}):") + if register is not None: + self._functions[register].dump() + else: + print(f" error: cannot find function.") + + +# Defines a convergence requirement for the simulation: +# A list of lanes impacted by a merge and possibly the associated +# continue target. +@dataclass +class ConvergenceRequirement: + mergeTarget: InstructionPointer + continueTarget: Optional[InstructionPointer] + impactedLanes: set[int] + + +Task = Dict[InstructionPointer, List[Lane]] + + +# Defines a Lane group/Wave in the simulator. +class Wave: + # The module this wave will execute. + _module: Module + # The lanes this wave will be composed of. + _lanes: List[Lane] + # The instructions scheduled for execution. + _tasks: Task + # The actual requirements to comply with when executing instructions. + # E.g: the set of lanes required to merge before executing the merge block. + _convergence_requirements: List[ConvergenceRequirement] + # The indices of the active lanes for the current executing instruction. + _active_lane_indices: set[int] + + def __init__(self, module, wave_size: int) -> None: + assert wave_size > 0 + self._module = module + self._lanes = [] + + for i in range(wave_size): + self._lanes.append(Lane(self, i)) + + self._tasks = {} + self._convergence_requirements = [] + # The indices of the active lanes for the current executing instruction. 
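+        # Updated by run() before each step; queried by get_first_active_lane_index().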
+ self._active_lane_indices = set() + + # Returns True if the given IP can be executed for the given list of lanes. + def _is_task_candidate(self, ip: InstructionPointer, lanes: List[Lane]): + merged_lanes: set[int] = set() + for lane in self._lanes: + if not lane.running(): + merged_lanes.add(lane.tid()) + + for requirement in self._convergence_requirements: + # This task is not executing a merge or continue target. + # Adding all lanes at those points into the ignore list. + if requirement.mergeTarget != ip and requirement.continueTarget != ip: + for tid in requirement.impactedLanes: + if self._lanes[tid].ip() == requirement.mergeTarget: + merged_lanes.add(tid) + if self._lanes[tid].ip() == requirement.continueTarget: + merged_lanes.add(tid) + continue + + # This task is executing the current requirement continue/merge + # target. + for tid in requirement.impactedLanes: + lane = self._lanes[tid] + if not lane.running(): + continue + + if lane.tid() in merged_lanes: + continue + + if ip == requirement.mergeTarget: + if lane.ip() != requirement.mergeTarget: + return False + else: + if ( + lane.ip() != requirement.mergeTarget + and lane.ip() != requirement.continueTarget + ): + return False + return True + + # Returns the next task we can schedule. This must always return a task. + # Calling this when all lanes are dead is invalid. + def _get_next_runnable_task(self) -> Tuple[InstructionPointer, List[Lane]]: + candidate = None + for ip, lanes in self._tasks.items(): + if len(lanes) == 0: + continue + if self._is_task_candidate(ip, lanes): + candidate = ip + break + + if candidate: + lanes = self._tasks[candidate] + del self._tasks[ip] + return (candidate, lanes) + raise RuntimeError("No task to execute. Deadlock?") + + # Handle an encountered merge instruction for the given lane. + def handle_convergence_header(self, lane: Lane, instruction: MergeInstruction): + mergeTarget = self._module.get_bb_entry(instruction.merge_location()) + for requirement in self._convergence_requirements: + if requirement.mergeTarget == mergeTarget: + requirement.impactedLanes.add(lane.tid()) + return + + continueTarget = None + if instruction.continue_location(): + continueTarget = self._module.get_bb_entry(instruction.continue_location()) + requirement = ConvergenceRequirement( + mergeTarget, continueTarget, set([lane.tid()]) + ) + self._convergence_requirements.append(requirement) + + # Returns true if some instructions are scheduled for execution. + def _has_tasks(self) -> bool: + return len(self._tasks) > 0 + + # Returns the index of the first active lane right now. + def get_first_active_lane_index(self) -> int: + return min(self._active_lane_indices) + + # Broadcast the given value to all active lane registers. + def broadcast_register(self, register: str, value: Any) -> None: + for tid in self._active_lane_indices: + self._lanes[tid].set_register(register, value) + + # Returns the entrypoint of the function associated with 'name'. + # Calling this function with an invalid name is illegal. + def _get_function_entry_from_name(self, name: str) -> InstructionPointer: + register = self._module.getRegisterFromName(name) + assert register is not None + return self._module.get_function_entry(register) + + # Run the wave on the function 'function_name' until all lanes are dead. + # If verbose is True, execution trace is printed. + # Returns the value returned by the function for each lane. 
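+    # The returned list is ordered by lane id (index 0 is lane 0, and so on).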
+ def run(self, function_name: str, verbose: bool = False) -> List[Any]: + for t in self._lanes: + self._module.initialize(t) + + entry_ip = self._get_function_entry_from_name(function_name) + assert entry_ip is not None + for t in self._lanes: + t.do_call(entry_ip, "__shader_output__") + + self._tasks[self._lanes[0].ip()] = self._lanes + while self._has_tasks(): + ip, lanes = self._get_next_runnable_task() + self._active_lane_indices = set([x.tid() for x in lanes]) + if verbose: + print( + f"Executing with lanes {self._active_lane_indices}: {ip.instruction()}" + ) + + for lane in lanes: + self._module.execute_one_instruction(lane, ip) + if not lane.running(): + continue + + if lane.ip() in self._tasks: + self._tasks[lane.ip()].append(lane) + else: + self._tasks[lane.ip()] = [lane] + + if verbose and ip.instruction().has_output_register(): + register = ip.instruction().output_register() + print( + f" {register:3} = {[ x.get_register(register, allow_undef=True) for x in lanes ]}" + ) + + output = [] + for lane in self._lanes: + output.append(lane.get_register("__shader_output__")) + return output + + def dump_register(self, register: str) -> None: + for lane in self._lanes: + print( + f" Lane {lane.tid():2} | {register:3} = {lane.get_register(register)}" + ) + + +parser = argparse.ArgumentParser( + description="simulator", formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument( + "-i", "--input", help="Text SPIR-V to read from", required=False, default="-" +) +parser.add_argument("-f", "--function", help="Function to execute") +parser.add_argument("-w", "--wave", help="Wave size", default=32, required=False) +parser.add_argument( + "-e", + "--expects", + help="Expected results per lanes, expects a list of values. Ex: '1, 2, 3'.", +) +parser.add_argument("-v", "--verbose", help="verbose", action="store_true") +args = parser.parse_args() + + +def load_instructions(filename: str): + if filename is None: + return [] + + if filename.strip() != "-": + try: + with open(filename, "r") as f: + lines = f.read().split("\n") + except Exception: # (FileNotFoundError, PermissionError): + return [] + else: + lines = sys.stdin.readlines() + + # Remove leading/trailing whitespaces. + lines = [x.strip() for x in lines] + # Strip comments. + lines = [x for x in filter(lambda x: len(x) != 0 and x[0] != ";", lines)] + + instructions = [] + for i in [Instruction(x) for x in lines]: + out = parseInstruction(i) + if out != None: + instructions.append(out) + return instructions + + +def main(): + if args.expects is None or not RE_EXPECTS.match(args.expects): + print("Invalid format for --expects/-e flag.", file=sys.stderr) + sys.exit(1) + if args.function is None: + print("Invalid format for --function/-f flag.", file=sys.stderr) + sys.exit(1) + try: + int(args.wave) + except ValueError: + print("Invalid format for --wave/-w flag.", file=sys.stderr) + sys.exit(1) + + expected_results = [int(x.strip()) for x in args.expects.split(",")] + wave_size = int(args.wave) + if len(expected_results) != wave_size: + print("Wave size != expected result array size", file=sys.stderr) + sys.exit(1) + + instructions = load_instructions(args.input) + if len(instructions) == 0: + print("Invalid input. Expected a text SPIR-V module.") + sys.exit(1) + + module = Module(instructions) + if args.verbose: + module.dump() + module.dump(args.function) + + function_names = module.get_function_names() + if args.function not in function_names: + print( + f"'{args.function}' function not found. 
Known functions are:",
+ file=sys.stderr,
+ )
+ for name in function_names:
+ print(f" - {name}", file=sys.stderr)
+ sys.exit(1)
+
+ wave = Wave(module, wave_size)
+ results = wave.run(args.function, verbose=args.verbose)
+
+ if expected_results != results:
+ print("Expected != Observed", file=sys.stderr)
+ print(f"{expected_results} != {results}", file=sys.stderr)
+ sys.exit(1)
+ sys.exit(0)
+
+
+main()

From afb6dafc6b680fb204d40c7fee4b339aa8471010 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?=
Date: Wed, 4 Sep 2024 11:27:03 +0200
Subject: [PATCH 054/425] [clang][HLSL] Add WaveIsFirstLane() intrinsic (#103299)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the WaveIsFirstLane() HLSL intrinsic. This intrinsic
uses the convergence intrinsics for the SPIR-V backend. On the DXIL
side, I'm not sure what the strategy is for convergence, so I
implemented it like in DXC: as a normal builtin function.

Signed-off-by: Nathan Gauër
---
 clang/include/clang/Basic/Builtins.td | 6 +++
 clang/lib/CodeGen/CGBuiltin.cpp | 4 ++
 clang/lib/CodeGen/CGHLSLRuntime.h | 1 +
 clang/lib/Headers/hlsl/hlsl_intrinsics.h | 4 ++
 .../builtins/wave_is_first_lane.hlsl | 34 ++++++++++++
 llvm/include/llvm/IR/IntrinsicsDirectX.td | 2 +
 llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 +
 llvm/lib/Target/DirectX/DXIL.td | 9 ++++
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 8 +++
 .../SPIRV/SPIRVStripConvergentIntrinsics.cpp | 53 +++++++++++--------
 .../CodeGen/DirectX/wave_is_first_lane.ll | 13 +++++
 .../SPIRV/hlsl-intrinsics/WaveIsFirstLane.ll | 27 ++++++++++
 12 files changed, 140 insertions(+), 22 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl
 create mode 100644 llvm/test/CodeGen/DirectX/wave_is_first_lane.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveIsFirstLane.ll

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 8668b25661dec..9e2a590f265ac 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4679,6 +4679,12 @@ def HLSLWaveGetLaneIndex : LangBuiltin<"HLSL_LANG"> {
 let Prototype = "unsigned int()";
 }
+def HLSLWaveIsFirstLane : LangBuiltin<"HLSL_LANG"> {
+ let Spellings = ["__builtin_hlsl_wave_is_first_lane"];
+ let Attributes = [NoThrow, Const];
+ let Prototype = "bool()";
+}
+
 def HLSLClamp : LangBuiltin<"HLSL_LANG"> {
 let Spellings = ["__builtin_hlsl_elementwise_clamp"];
 let Attributes = [NoThrow, Const];
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 786c2c224b349..012b43b8770be 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18723,6 +18723,10 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: {
 llvm::FunctionType::get(IntTy, {}, false), "__hlsl_wave_get_lane_index",
 {}, false, true));
 }
+ case Builtin::BI__builtin_hlsl_wave_is_first_lane: {
+ Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveIsFirstLaneIntrinsic();
+ return EmitRuntimeCall(Intrinsic::getDeclaration(&CGM.getModule(), ID));
+ }
 }
 return nullptr;
 }
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 55a4b97c160cd..40dcd74b7dd24 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -84,6 +84,7 @@ class CGHLSLRuntime {
 GENERATE_HLSL_INTRINSIC_FUNCTION(FDot, fdot)
 GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot)
 GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot)
+
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane) //===----------------------------------------------------------------------===// // End of reserved area for HLSL intrinsic getters. diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 6d38b668fe770..5c08a45a35377 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -1796,5 +1796,9 @@ _HLSL_AVAILABILITY(shadermodel, 6.0) _HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_get_lane_index) __attribute__((convergent)) uint WaveGetLaneIndex(); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_is_first_lane) +__attribute__((convergent)) bool WaveIsFirstLane(); + } // namespace hlsl #endif //_HLSL_HLSL_INTRINSICS_H_ diff --git a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl new file mode 100644 index 0000000000000..18860c321eb91 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-DXIL + +[numthreads(1, 1, 1)] +void main() { +// CHECK-SPIRV: %[[#entry_tok:]] = call token @llvm.experimental.convergence.entry() + +// CHECK-SPIRV: %[[#loop_tok:]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %[[#entry_tok]]) ] + while (true) { + +// CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#loop_tok]]) ] + if (WaveIsFirstLane()) { + break; + } + } + +// CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#entry_tok]]) ] + if (WaveIsFirstLane()) { + return; + } +} + +// CHECK-DXIL: i1 @llvm.dx.wave.is.first.lane() #[[#attr:]] +// CHECK-SPIRV: i1 @llvm.spv.wave.is.first.lane() #[[#attr:]] + +// CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}} diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 32af50b25f390..5f48e2ba939f5 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -79,4 +79,6 @@ def int_dx_umad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLV def int_dx_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]>; def int_dx_rcp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_dx_rsqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + +def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; } diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 63d9ba43a1183..cbf6e04f2844d 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -79,4 +79,5 @@ let TargetPrefix = "spv" in { DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], [IntrNoMem, 
Commutative] >;
+ def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
 }
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 83ea36ca048ad..4e3ecf4300d82 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -746,3 +746,12 @@ def CreateHandleFromBinding : DXILOp<218, createHandleFromBinding> {
 let result = HandleTy;
 let stages = [Stages];
 }
+
+def WaveIsFirstLane : DXILOp<110, waveIsFirstLane> {
+ let Doc = "returns 1 for the first lane in the wave";
+ let LLVMIntrinsic = int_dx_wave_is_first_lane;
+ let arguments = [];
+ let result = Int1Ty;
+ let stages = [Stages];
+ let attributes = [Attributes];
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 9e10d947081cc..fed82b904af4f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -2351,6 +2351,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
 }
 break;
 case Intrinsic::spv_saturate:
 return selectSaturate(ResVReg, ResType, I);
+ case Intrinsic::spv_wave_is_first_lane: {
+ SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII);
+ return BuildMI(BB, I, I.getDebugLoc(),
+ TII.get(SPIRV::OpGroupNonUniformElect))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(GR.getOrCreateConstInt(3, I, IntTy, TII));
+ }
 default: {
 std::string DiagMsg;
 raw_string_ostream OS(DiagMsg);
diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
index dca30535acfa1..b632d78497767 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
@@ -41,31 +41,40 @@ class SPIRVStripConvergentIntrinsics : public FunctionPass {
 virtual bool runOnFunction(Function &F) override {
 DenseSet<Instruction *> ToRemove;
+ // If the instruction is a convergent intrinsic, add it to the kill-list and
+ // return true. Return false otherwise.
+ auto CleanupIntrinsic = [&](IntrinsicInst *II) {
+ if (II->getIntrinsicID() != Intrinsic::experimental_convergence_entry &&
+ II->getIntrinsicID() != Intrinsic::experimental_convergence_loop &&
+ II->getIntrinsicID() != Intrinsic::experimental_convergence_anchor)
+ return false;
+
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ ToRemove.insert(II);
+ return true;
+ };
+
+ // Replace the given CallInst by a similar CallInst with no convergencectrl
+ // attribute.
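+ // The convergence token producers themselves are removed by CleanupIntrinsic
+ // above.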
+ auto CleanupCall = [&](CallInst *CI) { + auto OB = CI->getOperandBundle(LLVMContext::OB_convergencectrl); + if (!OB.has_value()) + return; + + auto *NewCall = CallBase::removeOperandBundle( + CI, LLVMContext::OB_convergencectrl, CI); + NewCall->copyMetadata(*CI); + CI->replaceAllUsesWith(NewCall); + ToRemove.insert(CI); + }; + for (BasicBlock &BB : F) { for (Instruction &I : BB) { - if (auto *II = dyn_cast(&I)) { - if (II->getIntrinsicID() != - Intrinsic::experimental_convergence_entry && - II->getIntrinsicID() != - Intrinsic::experimental_convergence_loop && - II->getIntrinsicID() != - Intrinsic::experimental_convergence_anchor) { + if (auto *II = dyn_cast(&I)) + if (CleanupIntrinsic(II)) continue; - } - - II->replaceAllUsesWith(UndefValue::get(II->getType())); - ToRemove.insert(II); - } else if (auto *CI = dyn_cast(&I)) { - auto OB = CI->getOperandBundle(LLVMContext::OB_convergencectrl); - if (!OB.has_value()) - continue; - - auto *NewCall = CallBase::removeOperandBundle( - CI, LLVMContext::OB_convergencectrl, CI); - NewCall->copyMetadata(*CI); - CI->replaceAllUsesWith(NewCall); - ToRemove.insert(CI); - } + if (auto *CI = dyn_cast(&I)) + CleanupCall(CI); } } diff --git a/llvm/test/CodeGen/DirectX/wave_is_first_lane.ll b/llvm/test/CodeGen/DirectX/wave_is_first_lane.ll new file mode 100644 index 0000000000000..2265dd8f7348c --- /dev/null +++ b/llvm/test/CodeGen/DirectX/wave_is_first_lane.ll @@ -0,0 +1,13 @@ +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s + +define void @main() #0 { +entry: +; CHECK: call i1 @dx.op.waveIsFirstLane(i32 110) + %0 = call i1 @llvm.dx.wave.is.first.lane() + ret void +} + +declare i1 @llvm.dx.wave.is.first.lane() #1 + +attributes #0 = { convergent norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #1 = { convergent nocallback nofree nosync nounwind willreturn } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveIsFirstLane.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveIsFirstLane.ll new file mode 100644 index 0000000000000..94597b37cc7eb --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveIsFirstLane.ll @@ -0,0 +1,27 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spirv-unknown-vulkan-compute" + +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#uint_3:]] = OpConstant %[[#uint]] 3 +; CHECK-DAG: %[[#bool:]] = OpTypeBool + +define spir_func void @main() #0 { +entry: + %0 = call token @llvm.experimental.convergence.entry() +; CHECK: %[[#]] = OpGroupNonUniformElect %[[#bool]] %[[#uint_3]] + %1 = call i1 @llvm.spv.wave.is.first.lane() [ "convergencectrl"(token %0) ] + ret void +} + +declare i32 @__hlsl_wave_get_lane_index() #1 + +attributes #0 = { convergent norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #1 = { convergent } + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 4, !"dx.disable_optimizations", i32 1} From cc5c526c80a4cacf7ed5b7fbe50072594ec1aeaf Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Wed, 4 Sep 2024 11:30:58 +0200 Subject: [PATCH 055/425] [lldb] Fix and speedup the `memory find` command (#104193) This patch fixes an issue where 
the `memory find` command would effectively stop searching after encountering a memory read error (which could happen due to unreadable memory), without giving any indication that it has done so (it would just print it could not find the pattern). To make matters worse, it would not terminate after encountering this error, but rather proceed to slowly increment the address pointer, which meant that searching a large region could take a very long time (and give the appearance that lldb is actually searching for the thing). The patch fixes this first problem by detecting read errors and skipping over (using GetMemoryRegionInfo) the unreadable parts of memory and resuming the search after them. It also reads the memory in bulk (`max(sizeof(pattern))`), which speeds up the search significantly (up to 6x for live processes, 18x for core files). --- lldb/source/Target/Process.cpp | 69 ++++++++++--------- .../memory/holes/TestMemoryHoles.py | 13 ++++ 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index ae64f6f261bad..6c5c5162e2468 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -114,33 +114,6 @@ class ProcessOptionValueProperties } }; -class ProcessMemoryIterator { -public: - ProcessMemoryIterator(Process &process, lldb::addr_t base) - : m_process(process), m_base_addr(base) {} - - bool IsValid() { return m_is_valid; } - - uint8_t operator[](lldb::addr_t offset) { - if (!IsValid()) - return 0; - - uint8_t retval = 0; - Status error; - if (0 == m_process.ReadMemory(m_base_addr + offset, &retval, 1, error)) { - m_is_valid = false; - return 0; - } - - return retval; - } - -private: - Process &m_process; - const lldb::addr_t m_base_addr; - bool m_is_valid = true; -}; - static constexpr OptionEnumValueElement g_follow_fork_mode_values[] = { { eFollowParent, @@ -3379,21 +3352,49 @@ lldb::addr_t Process::FindInMemory(lldb::addr_t low, lldb::addr_t high, if (region_size < size) return LLDB_INVALID_ADDRESS; + // See "Boyer-Moore string search algorithm". std::vector bad_char_heuristic(256, size); - ProcessMemoryIterator iterator(*this, low); - for (size_t idx = 0; idx < size - 1; idx++) { decltype(bad_char_heuristic)::size_type bcu_idx = buf[idx]; bad_char_heuristic[bcu_idx] = size - idx - 1; } - for (size_t s = 0; s <= (region_size - size);) { + + // Memory we're currently searching through. + llvm::SmallVector mem; + // Position of the memory buffer. + addr_t mem_pos = low; + // Maximum number of bytes read (and buffered). We need to read at least + // `size` bytes for a successful match. + const size_t max_read_size = std::max(size, 0x10000); + + for (addr_t cur_addr = low; cur_addr <= (high - size);) { + if (cur_addr + size > mem_pos + mem.size()) { + // We need to read more data. We don't attempt to reuse the data we've + // already read (up to `size-1` bytes from `cur_addr` to + // `mem_pos+mem.size()`). This is fine for patterns much smaller than + // max_read_size. For very + // long patterns we may need to do something more elaborate. + mem.resize_for_overwrite(max_read_size); + Status error; + mem.resize(ReadMemory(cur_addr, mem.data(), + std::min(mem.size(), high - cur_addr), error)); + mem_pos = cur_addr; + if (size > mem.size()) { + // We didn't read enough data. Skip to the next memory region. 
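+ // GetMemoryRegionInfo tells us where the current (unreadable) region ends,
+ // so we can resume the search right past it.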
+ MemoryRegionInfo info; + error = GetMemoryRegionInfo(mem_pos + mem.size(), info); + if (error.Fail()) + break; + cur_addr = info.GetRange().GetRangeEnd(); + continue; + } + } int64_t j = size - 1; - while (j >= 0 && buf[j] == iterator[s + j]) + while (j >= 0 && buf[j] == mem[cur_addr + j - mem_pos]) j--; if (j < 0) - return low + s; - else - s += bad_char_heuristic[iterator[s + size - 1]]; + return cur_addr; // We have a match. + cur_addr += bad_char_heuristic[mem[cur_addr + size - 1 - mem_pos]]; } return LLDB_INVALID_ADDRESS; diff --git a/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py b/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py index 1c2c90d483ea3..c61ae15b9dda7 100644 --- a/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py +++ b/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py @@ -43,6 +43,9 @@ def _prepare_inferior(self): # inside the holes we've deliberately left empty. self.memory = self.frame().FindVariable("mem_with_holes").GetValueAsUnsigned() self.pagesize = self.frame().FindVariable("pagesize").GetValueAsUnsigned() + self.num_pages = ( + self.target().FindFirstGlobalVariable("num_pages").GetValueAsUnsigned() + ) positions = self.frame().FindVariable("positions") self.positions = [ positions.GetChildAtIndex(i).GetValueAsUnsigned() @@ -58,3 +61,13 @@ def test_memory_read(self): self.assertEqual(len(content), self.pagesize) self.assertEqual(content[0:7], b"needle\0") self.assertTrue(error.Fail()) + + def test_memory_find(self): + self._prepare_inferior() + + matches = [f"data found at location: {p:#x}" for p in self.positions] + self.expect( + f'memory find --count {len(self.positions)+1} --string "needle" ' + f"{self.memory:#x} {self.memory+self.pagesize*self.num_pages:#x}", + substrs=matches + ["no more matches within the range"], + ) From 0b2550f8ab77a53f560f6a7a1b401c4803a36d48 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 4 Sep 2024 09:33:35 +0000 Subject: [PATCH 056/425] [lldb][test] Disable TestMultipleDebuggers on Linux This used to timeout (https://github.com/llvm/llvm-project/issues/101162) now it's aborting (https://github.com/llvm/llvm-project/pull/105765#issuecomment-2327645665) --- lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py index bee6e66aa7219..1fd4806cd74f4 100644 --- a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py +++ b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py @@ -13,6 +13,7 @@ class TestMultipleSimultaneousDebuggers(TestBase): NO_DEBUG_INFO_TESTCASE = True # Sometimes times out on Linux, see https://github.com/llvm/llvm-project/issues/101162. + @skipIfLinux @skipIfNoSBHeaders @skipIfWindows @skipIfHostIncompatibleWithTarget From 69657eb7f67600ec1c5c449d13eef3670dfb64da Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 4 Sep 2024 17:37:34 +0800 Subject: [PATCH 057/425] [llc] Provide `opt` like verifier options (#106665) - Support `verify-each` option. - Default behavior is verifying output only. 
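An illustrative invocation (the input file and pass pipeline are placeholders):
`llc --passes=two-address-instruction -verify-each -o - foo.mir` verifies the MIR
after every pass, while the same command without `-verify-each` only runs the
verifier on the final output.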
--- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 3 +++ llvm/test/CodeGen/AArch64/PHIElimination-crash.mir | 2 +- .../test/CodeGen/AArch64/PHIElimination-debugloc.mir | 2 +- llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir | 2 +- .../AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir | 2 +- .../AMDGPU/early-lis-two-address-partial-def.mir | 2 +- llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir | 2 +- llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir | 2 +- llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir | 2 +- llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir | 2 +- .../CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir | 2 +- llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir | 2 +- llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir | 4 ++-- llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir | 2 +- llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir | 2 +- llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir | 2 +- llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir | 2 +- llvm/test/CodeGen/PowerPC/livevars-crash1.mir | 2 +- llvm/test/CodeGen/PowerPC/livevars-crash2.mir | 2 +- llvm/test/CodeGen/X86/distancemap.mir | 2 +- llvm/test/CodeGen/X86/phielim-undef.mir | 2 +- llvm/test/CodeGen/X86/twoaddr-mul2.mir | 2 +- llvm/test/tools/llc/new-pm/pipeline.mir | 2 +- llvm/test/tools/llc/new-pm/verify.mir | 2 +- llvm/tools/llc/NewPMDriver.cpp | 12 ++++++++---- llvm/tools/llc/NewPMDriver.h | 4 +++- llvm/tools/llc/llc.cpp | 11 ++++++++++- 27 files changed, 48 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index eb15beb835b53..64d20ae5b20ef 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -548,6 +548,9 @@ Error CodeGenPassBuilder::buildPipeline( if (auto Err = derived().addMachinePasses(addPass)) return std::move(Err); + if (!Opt.DisableVerify) + addPass(MachineVerifierPass()); + if (PrintAsm) { derived().addAsmPrinter( addPass, [this, &Out, DwoOut, FileType](MCContext &Ctx) { diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir b/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir index 8f43686429268..4e09580aeb218 100644 --- a/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir +++ b/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir @@ -1,7 +1,7 @@ # RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o /dev/null %s \ # RUN: -run-pass=livevars,phi-node-elimination,twoaddressinstruction \ # RUN: -no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 -# RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o /dev/null %s \ +# RUN: llc -mtriple=aarch64-linux-gnu -verify-each -o /dev/null %s \ # RUN: --passes='require,phi-node-elimination,two-address-instruction' \ # RUN: -no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir index 9b8283352161a..01c44e3f253bb 100644 --- a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir +++ b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir @@ -2,7 +2,7 @@ # RUN: -run-pass=livevars,phi-node-elimination,twoaddressinstruction \ # RUN: -no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 \ # RUN: | FileCheck %s -# RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s \ +# RUN: llc -mtriple=aarch64-linux-gnu -verify-each -o - %s \ # RUN: --passes='require,phi-node-elimination,two-address-instruction' \ # RUN: 
-no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 \ # RUN: | FileCheck %s diff --git a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir index c483d669a2758..c1ddc9c14d814 100644 --- a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir +++ b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-unknown-linux -run-pass=twoaddressinstruction -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=aarch64-unknown-linux --passes=two-address-instruction %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-unknown-linux --passes=two-address-instruction -verify-each %s -o - | FileCheck %s # REQUIRES: aarch64-registered-target # Verify that the register class is correctly constrained after the twoaddress replacement diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir index e84c51b73ad1e..75148ecff5377 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -early-live-intervals -run-pass=liveintervals -run-pass=twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -passes='require,two-address-instruction' -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -passes='require,two-address-instruction' -verify-each -o - %s | FileCheck %s --- name: dyn_extract_v7f64_v_v diff --git a/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir b/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir index bf111d19f2147..186b171f4e805 100644 --- a/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir +++ b/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=liveintervals -run-pass=twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck --check-prefix=GFX90A %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --passes='require,two-address-instruction' -o - %s | FileCheck --check-prefix=GFX90A %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --passes='require,two-address-instruction' -verify-each -o - %s | FileCheck --check-prefix=GFX90A %s --- name: aligned_partial_vgpr_64 diff --git a/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir index 0f20b8a2f1e29..1768e39d1a06c 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX10 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s --passes=two-address-instruction -o - | FileCheck --check-prefixes=GFX10 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GFX10 %s # GFX10-LABEL: name: test_fmamk_reg_imm_f16 # GFX10: %2:vgpr_32 = IMPLICIT_DEF diff --git 
a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir index 91ade8806e4d1..820b8579bd0a4 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX11 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GFX11 %s --- name: test_fmamk_reg_imm_f16 diff --git a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir index 8b009978055ac..829d01d8e1c36 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple amdgcn -run-pass livevars -run-pass phi-node-elimination -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple amdgcn --passes='require,phi-node-elimination' -o - %s | FileCheck %s +# RUN: llc -mtriple amdgcn --passes='require,phi-node-elimination' -verify-each -o - %s | FileCheck %s # CHECK-LABEL: phi-cf-test # CHECK: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir b/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir index dfeca8db0b464..0ee768466509a 100644 --- a/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass liveintervals,phi-node-elimination -o - %s | FileCheck -check-prefixes=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 --passes='require,phi-node-elimination' -o - %s | FileCheck -check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 --passes='require,phi-node-elimination' -verify-each -o - %s | FileCheck -check-prefixes=GCN %s # This checks liveintervals pass verification and phi-node-elimination correctly preserves them. 
diff --git a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir index 4bb0046c0ee01..77032ffcf18a9 100644 --- a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir +++ b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=livevars,phi-node-elimination,twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 --passes='require,phi-node-elimination,two-address-instruction' -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 --passes='require,phi-node-elimination,two-address-instruction' -verify-each -o - %s | FileCheck %s # This used to fail under ASAN enabled build because we didn't update LiveVariables in SIInstrInfo::convertToThreeAddress() # CHECK: _amdgpu_ps_main diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir index ab6d207cd9668..b0a75a526cf2b 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s --passes=two-address-instruction -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s --passes=two-address-instruction -verify-each -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_fmamk_reg_imm_f64 # GCN: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir index daac34bab0fd0..1c444eca7675c 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir @@ -1,7 +1,7 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s --passes=two-address-instruction -o - | FileCheck --check-prefixes=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -o - | FileCheck --check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GCN %s # GCN-LABEL: name: test_fmamk_reg_imm_f32 # GCN: %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir index 7062878a84609..9b1deb9aa9f73 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 %s --passes=two-address-instruction -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 %s --passes=two-address-instruction -verify-each -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_madmk_reg_imm_f32 # GCN: V_MADMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec diff 
--git a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir index 5fbb149548909..98d2eca213aae 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-each -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32 # GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir b/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir index c533a5a167ab2..d5753491dbad1 100644 --- a/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir +++ b/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir @@ -1,5 +1,5 @@ # RUN: llc -march hexagon -run-pass livevars -run-pass twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -march hexagon --passes='require,two-address-instruction' -o - %s | FileCheck %s +# RUN: llc -march hexagon --passes='require,two-address-instruction' -verify-each -o - %s | FileCheck %s ############################################################################### diff --git a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir index 2a669ed6b03a1..8bdf719f4bb5b 100644 --- a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir +++ b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple powerpc64-unknown-linux-gnu -run-pass livevars -run-pass phi-node-elimination -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple powerpc64-unknown-linux-gnu --passes='require,phi-node-elimination' -o - %s | FileCheck %s +# RUN: llc -mtriple powerpc64-unknown-linux-gnu --passes='require,phi-node-elimination' -verify-each -o - %s | FileCheck %s # This test case was originally known as # test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll diff --git a/llvm/test/CodeGen/PowerPC/livevars-crash1.mir b/llvm/test/CodeGen/PowerPC/livevars-crash1.mir index 68d2e5a627e9d..ec01a3c326cfa 100644 --- a/llvm/test/CodeGen/PowerPC/livevars-crash1.mir +++ b/llvm/test/CodeGen/PowerPC/livevars-crash1.mir @@ -2,7 +2,7 @@ # RUN: -run-pass=livevars,phi-node-elimination -verify-machineinstrs | \ # RUN: FileCheck %s # RUN: llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ -# RUN: --passes='require,phi-node-elimination' | \ +# RUN: --passes='require,phi-node-elimination' -verify-each | \ # RUN: FileCheck %s --- | diff --git a/llvm/test/CodeGen/PowerPC/livevars-crash2.mir b/llvm/test/CodeGen/PowerPC/livevars-crash2.mir index e165c85d5b72a..deaae3936cefb 100644 --- a/llvm/test/CodeGen/PowerPC/livevars-crash2.mir +++ b/llvm/test/CodeGen/PowerPC/livevars-crash2.mir @@ -2,7 +2,7 @@ # RUN: -run-pass=livevars,phi-node-elimination -verify-machineinstrs | \ # RUN: FileCheck %s # RUN: llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ -# RUN: --passes='require,phi-node-elimination' | \ +# RUN: --passes='require,phi-node-elimination' -verify-each | \ # RUN: FileCheck %s --- | diff --git a/llvm/test/CodeGen/X86/distancemap.mir b/llvm/test/CodeGen/X86/distancemap.mir index 0a2f422302bd3..8b0c4bfe9d000 100644 --- 
a/llvm/test/CodeGen/X86/distancemap.mir +++ b/llvm/test/CodeGen/X86/distancemap.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -run-pass=twoaddressinstruction -verify-machineinstrs | FileCheck %s -# RUN: llc %s -o - -mtriple=x86_64-unknown-linux --passes=two-address-instruction | FileCheck %s +# RUN: llc %s -o - -mtriple=x86_64-unknown-linux --passes=two-address-instruction -verify-each | FileCheck %s # In TwoAddressInstructionPass, new instructions should be added to DistanceMap. # In this case, function convertInstTo3Addr is called on the first ADD diff --git a/llvm/test/CodeGen/X86/phielim-undef.mir b/llvm/test/CodeGen/X86/phielim-undef.mir index cebc725537d0e..1200449b49c78 100644 --- a/llvm/test/CodeGen/X86/phielim-undef.mir +++ b/llvm/test/CodeGen/X86/phielim-undef.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=x86_64-- -verify-machineinstrs -o - %s -run-pass=livevars,phi-node-elimination,twoaddressinstruction | FileCheck %s -# RUN: llc -mtriple=x86_64-- -verify-machineinstrs -o - %s --passes='require<live-vars>,phi-node-elimination,two-address-instruction' | FileCheck %s +# RUN: llc -mtriple=x86_64-- -verify-machineinstrs -o - %s --passes='require<live-vars>,phi-node-elimination,two-address-instruction' -verify-each | FileCheck %s --- | @b114 = external global i16, align 1 diff --git a/llvm/test/CodeGen/X86/twoaddr-mul2.mir b/llvm/test/CodeGen/X86/twoaddr-mul2.mir index e21005fa92397..b24c893cc11fc 100644 --- a/llvm/test/CodeGen/X86/twoaddr-mul2.mir +++ b/llvm/test/CodeGen/X86/twoaddr-mul2.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=x86_64-unknown -mcpu=haswell -run-pass=twoaddressinstruction -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=x86_64-unknown -mcpu=haswell --passes=two-address-instruction -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=x86_64-unknown -mcpu=haswell --passes=two-address-instruction -verify-each %s -o - | FileCheck %s # Check that we don't have any uses of [[COPY]] after it is killed. 
--- diff --git a/llvm/test/tools/llc/new-pm/pipeline.mir b/llvm/test/tools/llc/new-pm/pipeline.mir index ee058f763b779..761a3a424ee67 100644 --- a/llvm/test/tools/llc/new-pm/pipeline.mir +++ b/llvm/test/tools/llc/new-pm/pipeline.mir @@ -1,7 +1,7 @@ # RUN: llc -mtriple=x86_64-pc-linux-gnu -x mir -passes=no-op-machine-function --print-pipeline-passes -filetype=null < %s | FileCheck %s --match-full-lines # RUN: llc -mtriple=x86_64-pc-linux-gnu -x mir -passes='require,print' -print-pipeline-passes < %s | FileCheck --check-prefix=ANALYSIS %s -# CHECK: function(machine-function(no-op-machine-function)),PrintMIRPreparePass,function(machine-function(print)) +# CHECK: function(machine-function(no-op-machine-function)),PrintMIRPreparePass,function(machine-function(verify,print)) # ANALYSIS: require,print diff --git a/llvm/test/tools/llc/new-pm/verify.mir b/llvm/test/tools/llc/new-pm/verify.mir index 0cc7fc837e5be..6aa4362811e04 100644 --- a/llvm/test/tools/llc/new-pm/verify.mir +++ b/llvm/test/tools/llc/new-pm/verify.mir @@ -1,4 +1,4 @@ -# RUN: not --crash llc -mtriple=x86_64-pc-linux-gnu -debug-pass-manager -passes='module(function(machine-function(trigger-verifier-error)))' -filetype=null %s 2>&1 | FileCheck %s +# RUN: not --crash llc -mtriple=x86_64-pc-linux-gnu -debug-pass-manager -passes='module(function(machine-function(trigger-verifier-error)))' -verify-each -filetype=null %s 2>&1 | FileCheck %s # CHECK: Verifying machine function f # CHECK: Broken machine function found after pass "TriggerVerifierErrorPass" diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp index c8088da49a278..2a49eaff1d7cb 100644 --- a/llvm/tools/llc/NewPMDriver.cpp +++ b/llvm/tools/llc/NewPMDriver.cpp @@ -18,8 +18,10 @@ #include "llvm/CodeGen/CommandFlags.h" #include "llvm/CodeGen/MIRParser/MIRParser.h" #include "llvm/CodeGen/MIRPrinter.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" @@ -29,7 +31,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" -#include "llvm/Passes/CodeGenPassBuilder.h" // TODO: Include pass headers properly. 
#include "llvm/Passes/PassBuilder.h" #include "llvm/Passes/StandardInstrumentations.h" #include "llvm/Support/CommandLine.h" @@ -88,7 +89,7 @@ int llvm::compileModuleWithNewPM( StringRef Arg0, std::unique_ptr M, std::unique_ptr MIR, std::unique_ptr Target, std::unique_ptr Out, std::unique_ptr DwoOut, LLVMContext &Context, - const TargetLibraryInfoImpl &TLII, bool NoVerify, StringRef PassPipeline, + const TargetLibraryInfoImpl &TLII, VerifierKind VK, StringRef PassPipeline, CodeGenFileType FileType) { if (!PassPipeline.empty() && TargetPassConfig::hasLimitedCodeGenPipeline()) { @@ -104,14 +105,15 @@ int llvm::compileModuleWithNewPM( // Fetch options from TargetPassConfig CGPassBuilderOption Opt = getCGPassBuilderOption(); - Opt.DisableVerify = NoVerify; + Opt.DisableVerify = VK != VerifierKind::InputOutput; Opt.DebugPM = DebugPM; Opt.RegAlloc = RegAlloc; MachineModuleInfo MMI(&LLVMTM); PassInstrumentationCallbacks PIC; - StandardInstrumentations SI(Context, Opt.DebugPM, !NoVerify); + StandardInstrumentations SI(Context, Opt.DebugPM, + VK == VerifierKind::EachPass); registerCodeGenCallback(PIC, LLVMTM); MachineFunctionAnalysisManager MFAM; @@ -147,6 +149,8 @@ int llvm::compileModuleWithNewPM( ExitOnErr(PB.parsePassPipeline(MPM, PassPipeline)); MPM.addPass(PrintMIRPreparePass(*OS)); MachineFunctionPassManager MFPM; + if (VK == VerifierKind::InputOutput) + MFPM.addPass(MachineVerifierPass()); MFPM.addPass(PrintMIRPass(*OS)); FPM.addPass(createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); diff --git a/llvm/tools/llc/NewPMDriver.h b/llvm/tools/llc/NewPMDriver.h index b0beeaf596c8f..c8a60223cb296 100644 --- a/llvm/tools/llc/NewPMDriver.h +++ b/llvm/tools/llc/NewPMDriver.h @@ -32,6 +32,8 @@ class ToolOutputFile; class LLVMContext; class MIRParser; +enum class VerifierKind { None, InputOutput, EachPass }; + struct LLCDiagnosticHandler : public DiagnosticHandler { bool handleDiagnostics(const DiagnosticInfo &DI) override; }; @@ -42,7 +44,7 @@ int compileModuleWithNewPM(StringRef Arg0, std::unique_ptr M, std::unique_ptr Out, std::unique_ptr DwoOut, LLVMContext &Context, - const TargetLibraryInfoImpl &TLII, bool NoVerify, + const TargetLibraryInfoImpl &TLII, VerifierKind VK, StringRef PassPipeline, CodeGenFileType FileType); } // namespace llvm diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp index 80c84a977c26c..3f27a5fd1a0eb 100644 --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -130,6 +130,9 @@ static cl::opt SplitDwarfFile( static cl::opt NoVerify("disable-verify", cl::Hidden, cl::desc("Do not verify input module")); +static cl::opt VerifyEach("verify-each", + cl::desc("Verify after each transform")); + static cl::opt DisableSimplifyLibCalls("disable-simplify-libcalls", cl::desc("Disable simplify-libcalls")); @@ -647,10 +650,16 @@ static int compileModule(char **argv, LLVMContext &Context) { WithColor::warning(errs(), argv[0]) << ": warning: ignoring -mc-relax-all because filetype != obj"; + VerifierKind VK = VerifierKind::InputOutput; + if (NoVerify) + VK = VerifierKind::None; + else if (VerifyEach) + VK = VerifierKind::EachPass; + if (EnableNewPassManager || !PassPipeline.empty()) { return compileModuleWithNewPM(argv[0], std::move(M), std::move(MIR), std::move(Target), std::move(Out), - std::move(DwoOut), Context, TLII, NoVerify, + std::move(DwoOut), Context, TLII, VK, PassPipeline, codegen::getFileType()); } From deeafeab815ddfe7b507e9f79fe8f992265a9f3b Mon Sep 17 00:00:00 2001 From: Dmitry 
Vasilyev Date: Wed, 4 Sep 2024 13:38:55 +0400 Subject: [PATCH 058/425] [lldb][NFC] Move a few static helpers to the class Socket (#106640) Fixed a typo in Socket::SetOption(). --- lldb/include/lldb/Host/Socket.h | 15 +++++++++-- lldb/source/Host/common/Socket.cpp | 36 +++++++++++++++++++-------- lldb/source/Host/common/TCPSocket.cpp | 35 +++++--------------------- 3 files changed, 45 insertions(+), 41 deletions(-) diff --git a/lldb/include/lldb/Host/Socket.h b/lldb/include/lldb/Host/Socket.h index 304a91bdf6741..764a048976eb4 100644 --- a/lldb/include/lldb/Host/Socket.h +++ b/lldb/include/lldb/Host/Socket.h @@ -112,8 +112,17 @@ class Socket : public IOObject { static llvm::Expected> UdpConnect(llvm::StringRef host_and_port, bool child_processes_inherit); - int GetOption(int level, int option_name, int &option_value); - int SetOption(int level, int option_name, int option_value); + static int GetOption(NativeSocket sockfd, int level, int option_name, + int &option_value); + int GetOption(int level, int option_name, int &option_value) { + return GetOption(m_socket, level, option_name, option_value); + }; + + static int SetOption(NativeSocket sockfd, int level, int option_name, + int option_value); + int SetOption(int level, int option_name, int option_value) { + return SetOption(m_socket, level, option_name, option_value); + }; NativeSocket GetNativeSocket() const { return m_socket; } SocketProtocol GetSocketProtocol() const { return m_protocol; } @@ -138,6 +147,8 @@ class Socket : public IOObject { virtual size_t Send(const void *buf, const size_t num_bytes); + static int CloseSocket(NativeSocket sockfd); + static Status GetLastError(); static void SetLastError(Status &error); static NativeSocket CreateSocket(const int domain, const int type, const int protocol, diff --git a/lldb/source/Host/common/Socket.cpp b/lldb/source/Host/common/Socket.cpp index 1a506aa95b246..1a63571b94c6f 100644 --- a/lldb/source/Host/common/Socket.cpp +++ b/lldb/source/Host/common/Socket.cpp @@ -386,11 +386,7 @@ Status Socket::Close() { LLDB_LOGF(log, "%p Socket::Close (fd = %" PRIu64 ")", static_cast<void *>(this), static_cast<uint64_t>(m_socket)); -#if defined(_WIN32) - bool success = closesocket(m_socket) == 0; -#else - bool success = ::close(m_socket) == 0; -#endif + bool success = CloseSocket(m_socket) == 0; // A reference to a FD was passed in, set it to an invalid value m_socket = kInvalidSocketValue; if (!success) { @@ -400,18 +396,20 @@ Status Socket::Close() { return error; } -int Socket::GetOption(int level, int option_name, int &option_value) { +int Socket::GetOption(NativeSocket sockfd, int level, int option_name, + int &option_value) { get_socket_option_arg_type option_value_p = reinterpret_cast<get_socket_option_arg_type>(&option_value); socklen_t option_value_size = sizeof(int); - return ::getsockopt(m_socket, level, option_name, option_value_p, + return ::getsockopt(sockfd, level, option_name, option_value_p, &option_value_size); } -int Socket::SetOption(int level, int option_name, int option_value) { +int Socket::SetOption(NativeSocket sockfd, int level, int option_name, + int option_value) { set_socket_option_arg_type option_value_p = - reinterpret_cast<get_socket_option_arg_type>(&option_value); + reinterpret_cast<set_socket_option_arg_type>(&option_value); - return ::setsockopt(m_socket, level, option_name, option_value_p, + return ::setsockopt(sockfd, level, option_name, option_value_p, sizeof(option_value)); } @@ -427,6 +425,24 @@ void Socket::SetLastError(Status &error) { #endif } +Status Socket::GetLastError() { + std::error_code EC; +#ifdef _WIN32 + EC = 
llvm::mapWindowsError(WSAGetLastError()); +#else + EC = std::error_code(errno, std::generic_category()); +#endif + return EC; +} + +int Socket::CloseSocket(NativeSocket sockfd) { +#ifdef _WIN32 + return ::closesocket(sockfd); +#else + return ::close(sockfd); +#endif +} + NativeSocket Socket::CreateSocket(const int domain, const int type, const int protocol, bool child_processes_inherit, Status &error) { diff --git a/lldb/source/Host/common/TCPSocket.cpp b/lldb/source/Host/common/TCPSocket.cpp index fc005814308d9..4f1518ef697ff 100644 --- a/lldb/source/Host/common/TCPSocket.cpp +++ b/lldb/source/Host/common/TCPSocket.cpp @@ -33,28 +33,9 @@ #include #endif -#ifdef _WIN32 -#define CLOSE_SOCKET closesocket -typedef const char *set_socket_option_arg_type; -#else -#include -#define CLOSE_SOCKET ::close -typedef const void *set_socket_option_arg_type; -#endif - using namespace lldb; using namespace lldb_private; -static Status GetLastSocketError() { - std::error_code EC; -#ifdef _WIN32 - EC = llvm::mapWindowsError(WSAGetLastError()); -#else - EC = std::error_code(errno, std::generic_category()); -#endif - return EC; -} - static const int kType = SOCK_STREAM; TCPSocket::TCPSocket(bool should_close, bool child_processes_inherit) @@ -208,12 +189,8 @@ Status TCPSocket::Listen(llvm::StringRef name, int backlog) { continue; // enable local address reuse - int option_value = 1; - set_socket_option_arg_type option_value_p = - reinterpret_cast(&option_value); - if (::setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, option_value_p, - sizeof(option_value)) == -1) { - CLOSE_SOCKET(fd); + if (SetOption(fd, SOL_SOCKET, SO_REUSEADDR, 1) == -1) { + CloseSocket(fd); continue; } @@ -229,8 +206,8 @@ Status TCPSocket::Listen(llvm::StringRef name, int backlog) { err = ::listen(fd, backlog); if (err == -1) { - error = GetLastSocketError(); - CLOSE_SOCKET(fd); + error = GetLastError(); + CloseSocket(fd); continue; } @@ -251,7 +228,7 @@ Status TCPSocket::Listen(llvm::StringRef name, int backlog) { void TCPSocket::CloseListenSockets() { for (auto socket : m_listen_sockets) - CLOSE_SOCKET(socket.first); + CloseSocket(socket.first); m_listen_sockets.clear(); } @@ -280,7 +257,7 @@ llvm::Expected> TCPSocket::Accept( const lldb_private::SocketAddress &AddrIn = m_listen_sockets[fd]; if (!AddrIn.IsAnyAddr() && AcceptAddr != AddrIn) { - CLOSE_SOCKET(sock); + CloseSocket(sock); LLDB_LOG(log, "rejecting incoming connection from {0} (expecting {1})", AcceptAddr.GetIPAddress(), AddrIn.GetIPAddress()); return; From 59093cae8681fe5d3d951905887b67a99acf76e6 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 4 Sep 2024 10:39:43 +0100 Subject: [PATCH 059/425] [AARCH64][SVE] Add intrinsics for SVE LUTI instructions (#97058) This patch adds intrinsics for LUTI2 and LUTI4 instructions, which use SVE registers, as specified in the https://github.com/ARM-software/acle/pull/324 --- clang/include/clang/Basic/arm_sve.td | 20 +- .../aarch64-sve2-intrinsics/acle_sve2_luti.c | 337 ++++++++++++++++++ .../acle_sve2_imm_lane.cpp | 32 ++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 20 ++ llvm/lib/Target/AArch64/SVEInstrFormats.td | 36 +- .../CodeGen/AArch64/sve2-intrinsics-luti.ll | 107 ++++++ 6 files changed, 550 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 078373823a3b6..81527d8b98760 100644 --- a/clang/include/clang/Basic/arm_sve.td 
+++ b/clang/include/clang/Basic/arm_sve.td @@ -1939,6 +1939,24 @@ def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [VerifyRunti def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>; } +//////////////////////////////////////////////////////////////////////////////// +// SVE2 - Lookup table +let SVETargetGuard = "sve2,lut", SMETargetGuard = "sme2,lut" in { + def SVLUTI2_B : SInst<"svluti2_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI2_H : SInst<"svluti2_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>; + + def SVLUTI4_B : SInst<"svluti4_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>; + def SVLUTI4_H : SInst<"svluti4_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + + def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; +} + +let SVETargetGuard = "sve2,lut,bf16", SMETargetGuard = "sme2,lut,bf16" in { + def SVLUTI2_BF16 : SInst<"svluti2_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti2_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>; + def SVLUTI4_BF16 : SInst<"svluti4_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti4_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_BF16_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "b", MergeNone, "aarch64_sve_luti4_lane_x2", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; +} + //////////////////////////////////////////////////////////////////////////////// // SVE2 - Optional @@ -2400,4 +2418,4 @@ let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; -} +} \ No newline at end of file diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c new file mode 100644 index 0000000000000..60c4828c407e8 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c @@ -0,0 +1,337 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve -target-feature 
+sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -Wall -o /dev/null %s +#include + +#if defined __ARM_FEATURE_SME +#define MODE_ATTR __arm_streaming +#else +#define MODE_ATTR +#endif + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// SME-CHECK-LABEL: @test_svluti2_lane_s8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti2_lane_s8u10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svluti2_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_s8,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_u8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 3) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti2_lane_u8u11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svluti2_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_u8,)(table, indices, 3); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_s16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_s16u11__SVInt16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svluti2_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_s16,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_u16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 7) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 7) +// CHECK-NEXT: ret 
[[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_u16u12__SVUint16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svluti2_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_u16,)(table, indices, 7); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_f16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 5) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 5) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_f16u13__SVFloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 5) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svluti2_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_f16,)(table, indices, 5); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_bf16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svluti2_lane_bf16u14__SVBfloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svluti2_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_bf16,)(table, indices, 2); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_s8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti4_lane_s8u10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svluti4_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_s8,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_u8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 1) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 1) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti4_lane_u8u11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail 
call @llvm.aarch64.sve.luti4.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svluti4_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_u8,)(table, indices, 1); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_s16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_s16u11__SVInt16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svluti4_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_s16,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_u16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 7) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_u16u12__SVUint16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svluti4_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_u16,)(table, indices, 3); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_f16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 5) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_f16u13__SVFloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svluti4_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_f16,)(table, indices, 2); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_bf16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 1) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svluti4_lane_bf16u14__SVBfloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t 
test_svluti4_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_bf16,)(table, indices, 1); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_s16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_s16_x211svint16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svluti4_lane_s16_x2(svint16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_s16,_x2)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_u16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 7) +// SME-CHECK-NEXT: ret [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_u16_x212svuint16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svluti4_lane_u16_x2(svuint16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_u16,_x2)(table, indices, 3); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_f16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.x2.nxv8f16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], 
i32 5) +// SME-CHECK-NEXT: ret [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8f16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_f16_x213svfloat16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8f16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_svluti4_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_f16,_x2)(table, indices, 2); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_bf16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.x2.nxv8bf16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 2) +// SME-CHECK-NEXT: ret [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 1) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z25test_svluti4_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svluti4_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2)(table, indices, 1); +} diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp index bca063385420a..e405077b3de93 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp @@ -78,6 +78,14 @@ void test_range_0_7() SVE_ACLE_FUNC(svqrdmlsh_lane,_s16,,)(svundef_s16(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svqrdmulh_lane,_s16,,)(svundef_s16(), svundef_s16(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_s16,,)(svundef_s16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_u16,,)(svundef_u16(), 
svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_f16,,)(svundef_f16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1); } void test_range_0_3() @@ -146,6 +154,26 @@ void test_range_0_3() SVE_ACLE_FUNC(svqdmullb_lane,_s64,,)(svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svqdmullt_lane,_s64,,)(svundef_s32(), svundef_s32(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti2_lane,_s8,,)(svundef_s8(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti2_lane,_u8,,)(svundef_u8(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_s16,,)(svundef_s16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_u16,,)(svundef_u16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_f16,,)(svundef_f16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_s16,_x2,)(svcreate2_s16(svundef_s16(),svundef_s16()), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_u16,_x2,)(svcreate2_u16(svundef_u16(),svundef_u16()), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_f16,_x2,)(svcreate2_f16(svundef_f16(),svundef_f16()), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2,)(svcreate2_bf16(svundef_bf16(),svundef_bf16()), svundef_u8(), -1); } void test_range_0_1() @@ -180,4 +208,8 @@ void test_range_0_1() SVE_ACLE_FUNC(svqrdmlsh_lane,_s64,,)(svundef_s64(), svundef_s64(), svundef_s64(), 2); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} SVE_ACLE_FUNC(svqrdmulh_lane,_s64,,)(svundef_s64(), svundef_s64(), 2); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} + SVE_ACLE_FUNC(svluti4_lane,_s8,,)(svundef_s8(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} + SVE_ACLE_FUNC(svluti4_lane,_u8,,)(svundef_u8(), svundef_u8(), -1); } diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 3735bf5222fce..6c50b18ee583f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1288,6 +1288,13 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + class SVE2_LUTI_Inrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + class SVE2_1VectorArg_Long_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>, @@ -2682,6 +2689,19 @@ def int_aarch64_sve_sm4ekey : ClangBuiltin<"__builtin_sve_svsm4ekey_u32">, def int_aarch64_sve_tbl2 : AdvSIMD_SVE2_TBX_Intrinsic; def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic; +// +// SVE2 - Lookup Table +// + +def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic; +def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic; +def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + // // SVE2 - Optional bit permutation // diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a1b960fa8a58a..d6d503171a41e 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10412,6 +10412,15 @@ multiclass sve2_luti2_vector_index { let Inst{23-22} = idx{2-1}; let Inst{12} = idx{0}; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; } // FP8 Look up table read with 4-bit indices @@ -10424,14 +10433,39 @@ multiclass sve2_luti4_vector_index { bits<2> idx; let Inst{23-22} = idx; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; } // FP8 Look up table read with 4-bit indices (two contiguous registers) multiclass sve2_luti4_vector_vg2_index { - def _H : sve2_lut_vector_index { + def NAME : sve2_lut_vector_index { bits<2> idx; let Inst{23-22} = idx; } + + def : Pat<(nxv8i16 (int_aarch64_sve_luti4_lane_x2 nxv8i16:$Op1, nxv8i16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), + (nxv8i16 (!cast(NAME) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0, + nxv8i16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_3:$Op4))>; + def : Pat<(nxv8f16 (int_aarch64_sve_luti4_lane_x2 nxv8f16:$Op1, nxv8f16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), + (nxv8f16 (!cast(NAME) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0, + nxv8f16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_3:$Op4))>; + def : Pat<(nxv8bf16 (int_aarch64_sve_luti4_lane_x2 nxv8bf16:$Op1, nxv8bf16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), + (nxv8bf16 (!cast(NAME) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, + nxv8bf16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_3:$Op4))>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll new file mode 100644 index 0000000000000..5cea7536e1f3c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+sve2,+lut,+bf16 | FileCheck %s + +define @test_luti2_lane_i8( %table, %indices){ +; CHECK-LABEL: test_luti2_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.b, { z0.b }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti2.lane.nxv16i8( %table, %indices, i32 0) + ret %res +} + +define @test_luti2_lane_i16( %table, %indices){ +; CHECK-LABEL: 
test_luti2_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti2.lane.nxv8i16( %table, %indices, i32 0) + ret %res +} + +define @test_luti2_lane_f16( %table, %indices){ +; CHECK-LABEL: test_luti2_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti2.lane.nxv8f16( %table, %indices, i32 0) + ret %res +} + +define @test_luti2_lane_bf16( %table, %indices){ +; CHECK-LABEL: test_luti2_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti2.lane.nxv8bf16( %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_i8( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.b, { z0.b }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.nxv16i8( %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_i16( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.nxv8i16( %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_f16( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.nxv8f16( %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_bf16( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.nxv8bf16( %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_i16_x2( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_i16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8i16( %table, %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_f16_x2( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_f16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8f16( %table, %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_bf16_x2( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_bf16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16( %table, %table, %indices, i32 0) + ret %res +} From 3e948eb3e88d89107406ca0812934bea42101e3a Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 4 Sep 2024 10:39:59 +0100 Subject: [PATCH 060/425] [AArch64][NEON] Add intrinsics for LUTI (#96883) This patch adds intrinsics for NEON LUTI2 and LUTI4 instructions as specified in the [ACLE proposal](https://github.com/ARM-software/acle/pull/324) --- clang/include/clang/Basic/arm_neon.td | 19 + clang/lib/CodeGen/CGBuiltin.cpp | 89 +++ clang/test/CodeGen/aarch64-neon-luti.c | 506 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 35 ++ .../lib/Target/AArch64/AArch64InstrFormats.td | 14 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 40 ++ llvm/test/CodeGen/AArch64/neon-luti.ll | 253 +++++++++ 7 
files changed, 949 insertions(+), 7 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-neon-luti.c create mode 100644 llvm/test/CodeGen/AArch64/neon-luti.ll diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 3098fa67e6a51..536c0652280b9 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -2096,3 +2096,22 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">; def VSTL1_LANE : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">; } + +// Lookup table read with 2-bit/4-bit indices +let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in { + def VLUTI2_B : SInst<"vluti2_lane", "Q.(qU)I", "cUcPcQcQUcQPc">; + def VLUTI2_B_Q : SInst<"vluti2_laneq", "Q.(QU)I", "cUcPcQcQUcQPc">; + def VLUTI2_H : SInst<"vluti2_lane", "Q.(; + def VLUTI2_H_Q : SInst<"vluti2_laneq", "Q.(; + def VLUTI4_B : SInst<"vluti4_lane", "..(qU)I", "QcQUcQPc">; + def VLUTI4_B_Q : SInst<"vluti4_laneq", "..UI", "QcQUcQPc">; + def VLUTI4_H_X2 : SInst<"vluti4_lane_x2", ".2(; + def VLUTI4_H_X2_Q : SInst<"vluti4_laneq_x2", ".2(; + + let ArchGuard = "defined(__aarch64__)", TargetGuard= "lut,bf16" in { + def VLUTI2_BF : SInst<"vluti2_lane", "Q.(; + def VLUTI2_BF_Q : SInst<"vluti2_laneq", "Q.(; + def VLUTI4_BF_X2 : SInst<"vluti4_lane_x2", ".2(; + def VLUTI4_BF_X2_Q : SInst<"vluti4_laneq_x2", ".2(; + } +} diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 012b43b8770be..e826c1c6fbbd2 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -13481,6 +13481,95 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_suqadd; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd"); } + + case NEON::BI__builtin_neon_vluti2_laneq_bf16: + case NEON::BI__builtin_neon_vluti2_laneq_f16: + case NEON::BI__builtin_neon_vluti2_laneq_p16: + case NEON::BI__builtin_neon_vluti2_laneq_p8: + case NEON::BI__builtin_neon_vluti2_laneq_s16: + case NEON::BI__builtin_neon_vluti2_laneq_s8: + case NEON::BI__builtin_neon_vluti2_laneq_u16: + case NEON::BI__builtin_neon_vluti2_laneq_u8: { + Int = Intrinsic::aarch64_neon_vluti2_laneq; + llvm::Type *Tys[2]; + Tys[0] = Ty; + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ false)); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq"); + } + case NEON::BI__builtin_neon_vluti2q_laneq_bf16: + case NEON::BI__builtin_neon_vluti2q_laneq_f16: + case NEON::BI__builtin_neon_vluti2q_laneq_p16: + case NEON::BI__builtin_neon_vluti2q_laneq_p8: + case NEON::BI__builtin_neon_vluti2q_laneq_s16: + case NEON::BI__builtin_neon_vluti2q_laneq_s8: + case NEON::BI__builtin_neon_vluti2q_laneq_u16: + case NEON::BI__builtin_neon_vluti2q_laneq_u8: { + Int = Intrinsic::aarch64_neon_vluti2_laneq; + llvm::Type *Tys[2]; + Tys[0] = Ty; + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ true)); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq"); + } + case NEON::BI__builtin_neon_vluti2_lane_bf16: + case NEON::BI__builtin_neon_vluti2_lane_f16: + case NEON::BI__builtin_neon_vluti2_lane_p16: + case NEON::BI__builtin_neon_vluti2_lane_p8: + case NEON::BI__builtin_neon_vluti2_lane_s16: + case NEON::BI__builtin_neon_vluti2_lane_s8: + case NEON::BI__builtin_neon_vluti2_lane_u16: + case NEON::BI__builtin_neon_vluti2_lane_u8: { + Int = 
Intrinsic::aarch64_neon_vluti2_lane; + llvm::Type *Tys[2]; + Tys[0] = Ty; + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ false)); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane"); + } + case NEON::BI__builtin_neon_vluti2q_lane_bf16: + case NEON::BI__builtin_neon_vluti2q_lane_f16: + case NEON::BI__builtin_neon_vluti2q_lane_p16: + case NEON::BI__builtin_neon_vluti2q_lane_p8: + case NEON::BI__builtin_neon_vluti2q_lane_s16: + case NEON::BI__builtin_neon_vluti2q_lane_s8: + case NEON::BI__builtin_neon_vluti2q_lane_u16: + case NEON::BI__builtin_neon_vluti2q_lane_u8: { + Int = Intrinsic::aarch64_neon_vluti2_lane; + llvm::Type *Tys[2]; + Tys[0] = Ty; + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ true)); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane"); + } + case NEON::BI__builtin_neon_vluti4q_lane_p8: + case NEON::BI__builtin_neon_vluti4q_lane_s8: + case NEON::BI__builtin_neon_vluti4q_lane_u8: { + Int = Intrinsic::aarch64_neon_vluti4q_lane; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane"); + } + case NEON::BI__builtin_neon_vluti4q_laneq_p8: + case NEON::BI__builtin_neon_vluti4q_laneq_s8: + case NEON::BI__builtin_neon_vluti4q_laneq_u8: { + Int = Intrinsic::aarch64_neon_vluti4q_laneq; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq"); + } + case NEON::BI__builtin_neon_vluti4q_lane_bf16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_f16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_p16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_s16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_u16_x2: { + Int = Intrinsic::aarch64_neon_vluti4q_lane_x2; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane_x2"); + } + case NEON::BI__builtin_neon_vluti4q_laneq_bf16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_f16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_p16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_s16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_u16_x2: { + Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2"); + } } } diff --git a/clang/test/CodeGen/aarch64-neon-luti.c b/clang/test/CodeGen/aarch64-neon-luti.c new file mode 100644 index 0000000000000..72cb6bcdb40f0 --- /dev/null +++ b/clang/test/CodeGen/aarch64-neon-luti.c @@ -0,0 +1,506 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +#include +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -O3 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_u8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +uint8x16_t test_vluti2_lane_u8(uint8x8_t vn, uint8x8_t vm) { + return vluti2_lane_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_u8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +uint8x16_t test_vluti2_laneq_u8(uint8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +uint8x16_t test_vluti2q_lane_u8(uint8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_u8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +uint8x16_t test_vluti2q_laneq_u8(uint8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_u8(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_s8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +int8x16_t test_vluti2_lane_s8(int8x8_t vn, uint8x8_t vm) { + return vluti2_lane_s8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_s8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +int8x16_t test_vluti2_laneq_s8(int8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_s8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +int8x16_t test_vluti2q_lane_s8(int8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_s8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +int8x16_t test_vluti2q_laneq_s8(int8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_s8(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_p8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> 
@llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +poly8x16_t test_vluti2_lane_p8(poly8x8_t vn, uint8x8_t vm) { + return vluti2_lane_p8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_p8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +poly8x16_t test_vluti2_laneq_p8(poly8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_p8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +poly8x16_t test_vluti2q_lane_p8(poly8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_p8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +poly8x16_t test_vluti2q_laneq_p8(poly8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_p8(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_u16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +uint16x8_t test_vluti2_lane_u16(uint16x4_t vn, uint8x8_t vm) { + return vluti2_lane_u16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_u16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +uint16x8_t test_vluti2_laneq_u16(uint16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_u16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_u16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<8 x i16> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +uint16x8_t test_vluti2q_lane_u16(uint16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_u16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_u16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> 
[[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +uint16x8_t test_vluti2q_laneq_u16(uint16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_u16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_s16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +int16x8_t test_vluti2_lane_s16(int16x4_t vn, uint8x8_t vm) { + return vluti2_lane_s16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_s16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +int16x8_t test_vluti2_laneq_s16(int16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_s16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_s16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<8 x i16> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +int16x8_t test_vluti2q_lane_s16(int16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_s16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_s16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +int16x8_t test_vluti2q_laneq_s16(int16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_s16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2_lane_f16( +// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v4f16(<4 x half> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANE1]] +// +float16x8_t test_vluti2_lane_f16(float16x4_t vn, uint8x8_t vm) { + return vluti2_lane_f16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2_laneq_f16( +// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v4f16(<4 x half> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANEQ1]] +// +float16x8_t test_vluti2_laneq_f16(float16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_f16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2q_lane_f16( +// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v8f16(<8 x half> [[VN]], <8 x i8> 
[[VM]], i32 3) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANE1]] +// +float16x8_t test_vluti2q_lane_f16(float16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_f16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2q_laneq_f16( +// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v8f16(<8 x half> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANEQ1]] +// +float16x8_t test_vluti2q_laneq_f16(float16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_f16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2_lane_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v4bf16(<4 x bfloat> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANE1]] +// +bfloat16x8_t test_vluti2_lane_bf16(bfloat16x4_t vn, uint8x8_t vm) { + return vluti2_lane_bf16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2_laneq_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v4bf16(<4 x bfloat> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANEQ1]] +// +bfloat16x8_t test_vluti2_laneq_bf16(bfloat16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_bf16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2q_lane_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v8bf16(<8 x bfloat> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANE1]] +// +bfloat16x8_t test_vluti2q_lane_bf16(bfloat16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_bf16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2q_laneq_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v8bf16(<8 x bfloat> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANEQ1]] +// +bfloat16x8_t test_vluti2q_laneq_bf16(bfloat16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_bf16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_p16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +poly16x8_t test_vluti2_lane_p16(poly16x4_t vn, uint8x8_t vm) { + return vluti2_lane_p16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_p16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call 
<8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +poly16x8_t test_vluti2_laneq_p16(poly16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_p16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_p16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<8 x i16> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +poly16x8_t test_vluti2q_lane_p16(poly16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_p16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_p16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +poly16x8_t test_vluti2q_laneq_p16(poly16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_p16(vn, vm, 7); +} + +// + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]] +// +uint8x16_t test_vluti4q_lane_u8(uint8x16_t vn, uint8x8_t vm) { + return vluti4q_lane_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +uint8x16_t test_vluti4q_laneq_u8(uint8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]] +// +int8x16_t test_vluti4q_lane_s8(int8x16_t vn, uint8x8_t vm) { + return vluti4q_lane_s8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +int8x16_t test_vluti4q_laneq_s8(int8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_s8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = tail call <16 x i8> 
@llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]] +// +poly8x16_t test_vluti4q_lane_p8(poly8x16_t vn, uint8x8_t vm) { + return vluti4q_lane_p8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +poly8x16_t test_vluti4q_laneq_p8(poly8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_p8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_lane_u16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANE_X24]] +// +uint16x8_t test_vluti4q_lane_u16_x2(uint16x8x2_t vn, uint8x8_t vm) { + return vluti4q_lane_u16_x2(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_u16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]] +// +uint16x8_t test_vluti4q_laneq_u16_x2(uint16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_u16_x2(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_lane_s16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANE_X24]] +// +int16x8_t test_vluti4q_lane_s16_x2(int16x8x2_t vn, uint8x8_t vm) { + return vluti4q_lane_s16_x2(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_s16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: 
[[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]] +// +int16x8_t test_vluti4q_laneq_s16_x2(int16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_s16_x2(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti4q_lane_f16_x2( +// CHECK-SAME: [2 x <8 x half>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti4q.lane.x2.v8f16(<8 x half> [[VN_COERCE_FCA_0_EXTRACT]], <8 x half> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <8 x half> [[VLUTI4Q_LANE_X24]] +// +float16x8_t test_vluti4q_lane_f16_x2(float16x8x2_t vn, uint8x8_t vm) { + return vluti4q_lane_f16_x2(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti4q_laneq_f16_x2( +// CHECK-SAME: [2 x <8 x half>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti4q.laneq.x2.v8f16(<8 x half> [[VN_COERCE_FCA_0_EXTRACT]], <8 x half> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <8 x half> [[VLUTI4Q_LANEQ_X24]] +// +float16x8_t test_vluti4q_laneq_f16_x2(float16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_f16_x2(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti4q_lane_bf16_x2( +// CHECK-SAME: [2 x <8 x bfloat>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.lane.x2.v8bf16(<8 x bfloat> [[VN_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 2) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI4Q_LANE_X24]] +// +bfloat16x8_t test_vluti4q_lane_bf16_x2(bfloat16x8x2_t vn, uint8x8_t vm) { + return vluti4q_lane_bf16_x2(vn, vm, 2); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti4q_laneq_bf16_x2( +// CHECK-SAME: [2 x <8 x bfloat>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.laneq.x2.v8bf16(<8 x bfloat> [[VN_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 2) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI4Q_LANEQ_X24]] +// +bfloat16x8_t 
test_vluti4q_laneq_bf16_x2(bfloat16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_bf16_x2(vn, vm, 2); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_lane_p16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANE_X24]] +// +poly16x8_t test_vluti4q_lane_p16_x2(poly16x8x2_t vn, uint8x8_t vm) { + return vluti4q_lane_p16_x2(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_p16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]] +// +poly16x8_t test_vluti4q_laneq_p16_x2(poly16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_p16_x2(vn, vm, 0); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6c50b18ee583f..6727ee69d7b3e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -565,6 +565,41 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in { def int_aarch64_neon_vcmla_rot270 : AdvSIMD_3VectorArg_Intrinsic; } +let TargetPrefix = "aarch64" in { +def int_aarch64_neon_vluti2_lane : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_v8i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + +def int_aarch64_neon_vluti2_laneq : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_v16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + +def int_aarch64_neon_vluti4q_lane: DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v8i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + +def int_aarch64_neon_vluti4q_laneq: DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + + +def int_aarch64_neon_vluti4q_lane_x2: + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, + llvm_v8i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<3>>]>; + +def int_aarch64_neon_vluti4q_laneq_x2: + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, + llvm_v16i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<3>>]>; +} + let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
class AdvSIMD_2Vector2Index_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index ab8251dc83014..16002011aedfb 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -8239,11 +8239,11 @@ multiclass SIMDTableLookupTied { // AdvSIMD LUT //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTableLookupIndexed opc, RegisterOperand vectype, +class BaseSIMDTableLookupIndexed opc, RegisterOperand listtype, Operand idx_type, string asm, string kind> - : I<(outs vectype:$Rd), - (ins listtype:$Rn, vectype:$Rm, idx_type:$idx), + : I<(outs V128:$Rd), + (ins listtype:$Rn, V128:$Rm, idx_type:$idx), asm, "\t$Rd" # kind # ", $Rn, $Rm$idx", "", []>, Sched<[]> { bits<5> Rd; @@ -8263,22 +8263,22 @@ class BaseSIMDTableLookupIndexed opc, RegisterOperand vectype, } multiclass BaseSIMDTableLookupIndexed2 { - def v16f8 : BaseSIMDTableLookupIndexed<0b1, {0b10,?,?,0b1}, V128, VecListOne16b, VectorIndexS, asm, ".16b"> { + def _B : BaseSIMDTableLookupIndexed<0b1, {0b10,?,?,0b1}, VecListOne16b, VectorIndexS32b_timm, asm, ".16b"> { bits<2> idx; let Inst{14-13} = idx; } - def v8f16 : BaseSIMDTableLookupIndexed<0b1, {0b11,?,?,?}, V128, VecListOne8h, VectorIndexH, asm, ".8h" > { + def _H : BaseSIMDTableLookupIndexed<0b1, {0b11,?,?,?}, VecListOne8h, VectorIndexH32b_timm, asm, ".8h" > { bits<3> idx; let Inst{14-12} = idx; } } multiclass BaseSIMDTableLookupIndexed4 { - def v16f8 : BaseSIMDTableLookupIndexed<0b1, {0b01,?,0b10}, V128, VecListOne16b, VectorIndexD, asm, ".16b"> { + def _B : BaseSIMDTableLookupIndexed<0b1, {0b01,?,0b10}, VecListOne16b, VectorIndexD32b_timm, asm, ".16b"> { bit idx; let Inst{14} = idx; } - def v8f16 : BaseSIMDTableLookupIndexed<0b1, {0b01,?,?,0b1}, V128, VecListTwo8h, VectorIndexS, asm, ".8h" > { + def _H : BaseSIMDTableLookupIndexed<0b1, {0b01,?,?,0b1}, VecListTwo8h, VectorIndexS32b_timm, asm, ".8h" > { bits<2> idx; let Inst{14-13} = idx; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c659697c3a1be..ccef85bfaa8af 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6616,6 +6616,46 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), let Predicates = [HasLUT] in { defm LUT2 : BaseSIMDTableLookupIndexed2<"luti2">; defm LUT4 : BaseSIMDTableLookupIndexed4<"luti4">; + + multiclass Luti2_patterns{ + def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT64:$Rn, + v8i8:$Rm, i32:$idx)), + (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_laneq VT64:$Rn, + v16i8:$Rm, i32:$idx)), + (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), + V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT128:$Rn, + v8i8:$Rm, i32:$idx)), + (Instr V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), + VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_laneq VT128:$Rn, + v16i8:$Rm, i32:$idx)), + (Instr V128:$Rn, V128:$Rm, VectorIndexS32b_timm:$idx)>; + } + + defm : Luti2_patterns; + defm : Luti2_patterns; + defm : Luti2_patterns; + defm : Luti2_patterns; + + def : Pat<(v16i8 (int_aarch64_neon_vluti4q_laneq v16i8:$Rn, + v16i8:$Rm, i32:$idx)), + (LUT4_B 
VecListOne16b:$Rn, V128:$Rm, VectorIndexD32b_timm:$idx)>; + def : Pat<(v16i8 (int_aarch64_neon_vluti4q_lane v16i8:$Rn, + v8i8:$Rm, i32:$idx)), + (LUT4_B VecListOne16b:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexD32b_timm:$idx)>; + + foreach VT = [v8i16, v8f16, v8bf16] in { + def : Pat<(VT (int_aarch64_neon_vluti4q_laneq_x2 VT:$Rn1, + VT:$Rn2, v16i8:$Rm, i32:$idx)), + (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(VT (int_aarch64_neon_vluti4q_lane_x2 VT:$Rn1, + VT:$Rn2, v8i8:$Rm, i32:$idx)), + (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + } } //---------------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/AArch64/neon-luti.ll b/llvm/test/CodeGen/AArch64/neon-luti.ll new file mode 100644 index 0000000000000..5436662753762 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-luti.ll @@ -0,0 +1,253 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon,+lut,+bf16 | FileCheck %s + +define <16 x i8> @test_luti2_lane_i8(<8 x i8> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> %vn, <8 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti2_laneq_i8(<8 x i8> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> %vn, <16 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti2q_lane_i8(<16 x i8> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> %vn, <8 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti2q_laneq_i8(<16 x i8> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> %vn, <16 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <8 x i16> @test_luti2_lane_i16(<4 x i16> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> %vn, <8 x i8> %vm, i32 0) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti2_laneq_i16(<4 x i16> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> %vn, <16 x i8> 
%vm, i32 0) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti2q_lane_i16(<4 x i16> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<4 x i16> %vn, <8 x i8> %vm, i32 0) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti2q_laneq_i16(<8 x i16> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> %vn, <16 x i8> %vm, i32 0) + ret <8 x i16> %res +} + +define <8 x half> @test_luti2_lane_f16(<4 x half> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v4f16(<4 x half> %vn, <8 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x half> @test_luti2_laneq_f16(<4 x half> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v4i16(<4 x half> %vn, <16 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x half> @test_luti2q_lane_f16(<8 x half> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v8f16(<8 x half> %vn, <8 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x half> @test_luti2q_laneq_f16(<8 x half> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v8f16(<8 x half> %vn, <16 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x bfloat> @test_luti2_lane_bf16(<4 x bfloat> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v4bf16(<4 x bfloat> %vn, <8 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti2_laneq_bf16(<4 x bfloat> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v4bf16(<4 x bfloat> %vn, <16 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti2q_lane_bf16(<4 x bfloat> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v8bf16(<4 x 
bfloat> %vn, <8 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti2q_laneq_bf16(<8 x bfloat> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v8bf16(<8 x bfloat> %vn, <16 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <16 x i8> @test_luti4q_lane_i8(<16 x i8> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti4q_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti4 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> %vn, <8 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti4q_laneq_i8(<16 x i8> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> %vn, <16 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <8 x i16> @test_luti4q_lane_x2_i16(<8 x i16> %vn1, <8 x i16> %vn2, <8 x i8> %vm){ +; CHECK-LABEL: test_luti4q_lane_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> %vn1, <8 x i16> %vn2, <8 x i8> %vm, i32 1) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti4q_laneq_x2_i16(<8 x i16> %vn1, <8 x i16> %vn2, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> %vn1, <8 x i16> %vn2, <16 x i8> %vm, i32 1) + ret <8 x i16> %res +} + +define <8 x half> @test_luti4q_lane_x2_f16(<8 x half>%vn1, <8 x half> %vn2, <8 x i8> %vm){ +; CHECK-LABEL: test_luti4q_lane_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti4q.lane.x2.v8f16(<8 x half> %vn1, <8 x half> %vn2, <8 x i8> %vm, i32 1) + ret <8 x half> %res +} + + +define <8 x half> @test_luti4q_laneq_x2_f16(<8 x half>%vn1, <8 x half> %vn2, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti4q.laneq.x2.v8f16(<8 x half> %vn1, <8 x half> %vn2, <16 x i8> %vm, i32 1) + ret <8 x half> %res +} + +define <8 x bfloat> @test_luti4q_laneq_x2_bf16(<8 x bfloat>%vn1, <8 x bfloat> %vn2, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def 
$q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.laneq.x2.v8bf16(<8 x bfloat> %vn1, <8 x bfloat> %vn2, <16 x i8> %vm, i32 1) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti4q_lane_x2_bf16(<8 x bfloat>%vn1, <8 x bfloat> %vn2, <8 x i8> %vm){ +; CHECK-LABEL: test_luti4q_lane_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.lane.x2.v8bf16(<8 x bfloat> %vn1, <8 x bfloat> %vn2, <8 x i8> %vm, i32 1) + ret <8 x bfloat> %res +} From 5a658ee933065d0e4ef1a65d9a6ddfba2874ee98 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 4 Sep 2024 09:42:18 +0000 Subject: [PATCH 061/425] [lldb][test] Skip some lldb-server tests on Windows These are known to return errors occasionaly on our Windows on Arm bot. --- lldb/test/API/tools/lldb-server/TestGdbRemoteLaunch.py | 1 + lldb/test/API/tools/lldb-server/TestLldbGdbServer.py | 1 + lldb/test/API/tools/lldb-server/TestNonStop.py | 1 + 3 files changed, 3 insertions(+) diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteLaunch.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteLaunch.py index ad84a40932c65..a2ac1fdb6270f 100644 --- a/lldb/test/API/tools/lldb-server/TestGdbRemoteLaunch.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteLaunch.py @@ -58,6 +58,7 @@ def test_launch_via_vRun(self): self.assertEqual(context["O_content"], b"arg1\r\narg2\r\narg3\r\n") @add_test_categories(["llgs"]) + @skipIfWindows # Sometimes returns '$E1f'. def test_launch_via_vRun_no_args(self): self.build() server = self.connect_to_debug_monitor() diff --git a/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py b/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py index 93485cd32f519..592037db502aa 100644 --- a/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py +++ b/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py @@ -468,6 +468,7 @@ def test_Hg_fails_on_zero_pid(self): self.Hg_fails_on_pid(0) @add_test_categories(["llgs"]) + @skipIfWindows # Sometimes returns '$E37'. def test_Hg_fails_on_minus_one_pid(self): self.build() self.set_inferior_startup_launch() diff --git a/lldb/test/API/tools/lldb-server/TestNonStop.py b/lldb/test/API/tools/lldb-server/TestNonStop.py index 62bda48ee049b..841de50818797 100644 --- a/lldb/test/API/tools/lldb-server/TestNonStop.py +++ b/lldb/test/API/tools/lldb-server/TestNonStop.py @@ -276,6 +276,7 @@ def test_multiple_vCont(self): self.expect_gdbremote_sequence() @add_test_categories(["llgs"]) + @skipIfWindows # Sometimes results in '$E37' instead of expected '$OK' def test_vCont_then_stop(self): self.build() self.set_inferior_startup_launch() From d25e24a0eb909b7604572d28d15cbe648ecccd90 Mon Sep 17 00:00:00 2001 From: Carlos Galvez Date: Wed, 4 Sep 2024 11:49:16 +0200 Subject: [PATCH 062/425] =?UTF-8?q?[clang-tidy]=20Suggest=20using=20reinte?= =?UTF-8?q?rpret=5Fcast=20in=20bugprone-casting-thro=E2=80=A6=20(#106784)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ugh-void reinterpret_cast is the equivalent construct, and more clearly expresses intent. 
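As an illustration only (this example is not part of the diff; the variable and function names are made up for exposition), a minimal sketch of the pattern the check flags and the rewrite it now suggests:

    double d = 0.0;
    int *before() {
      // Flagged by bugprone-casting-through-void; with this change the diagnostic reads
      // "do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead".
      return static_cast<int *>(static_cast<void *>(&d));
    }
    int *after() {
      // The single equivalent cast the check suggests. Dereferencing the result still breaks
      // strict aliasing, so cppcoreguidelines-pro-type-reinterpret-cast is expected to warn here.
      return reinterpret_cast<int *>(&d);
    }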
Co-authored-by: Carlos Gálvez --- .../bugprone/CastingThroughVoidCheck.cpp | 4 +- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++ .../checks/bugprone/casting-through-void.rst | 21 +++++++++- .../bugprone/casting-through-void.cpp | 40 +++++++++---------- 4 files changed, 46 insertions(+), 23 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp index 9e714b4be4dfe..f0a9ace229740 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp @@ -38,7 +38,9 @@ void CastingThroughVoidCheck::check(const MatchFinder::MatchResult &Result) { const auto ST = *Result.Nodes.getNodeAs<QualType>("source_type"); const auto VT = *Result.Nodes.getNodeAs<QualType>("void_type"); const auto *CE = Result.Nodes.getNodeAs<ExplicitCastExpr>("cast"); - diag(CE->getExprLoc(), "do not cast %0 to %1 through %2") << ST << TT << VT; + diag(CE->getExprLoc(), + "do not cast %0 to %1 through %2; use reinterpret_cast instead") + << ST << TT << VT; } } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index b001a6ad44669..6999c1ef2ea4b 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -104,6 +104,10 @@ New check aliases Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Improved :doc:`bugprone-casting-through-void + <clang-tidy/checks/bugprone/casting-through-void>` check to suggest replacing + the offending code with ``reinterpret_cast``, to more clearly express intent. + - Improved :doc:`modernize-use-std-format <clang-tidy/checks/modernize/use-std-format>` check to support replacing member function calls too. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst index a9ab478b9a82e..d9f94b6a3f20b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst @@ -3,7 +3,9 @@ bugprone-casting-through-void ============================= -Detects unsafe or redundant two-step casting operations involving ``void*``. +Detects unsafe or redundant two-step casting operations involving ``void*``, +which is equivalent to ``reinterpret_cast`` as per the +`C++ Standard `_. Two-step type conversions via ``void*`` are discouraged for several reasons. @@ -16,7 +18,17 @@ Two-step type conversions via ``void*`` are discouraged for several reasons. In summary, avoiding two-step type conversions through ``void*`` ensures clearer code, maintains essential compiler warnings, and prevents ambiguity and potential runtime -errors, particularly in complex inheritance scenarios. +errors, particularly in complex inheritance scenarios. If such a cast is wanted, +it shall be done via ``reinterpret_cast``, to express the intent more clearly. + +Note: it is expected that, after applying the suggested fix and using +``reinterpret_cast``, the check :doc:`cppcoreguidelines-pro-type-reinterpret-cast +<../cppcoreguidelines/pro-type-reinterpret-cast>` will emit a warning. +This is intentional: ``reinterpret_cast`` is a dangerous operation that can +easily break the strict aliasing rules when dereferencing the casted pointer, +invoking Undefined Behavior. The warning is there to prompt users to carefully +analyze whether the usage of ``reinterpret_cast`` is safe, in which case the +warning may be suppressed.
Examples: @@ -29,3 +41,8 @@ Examples: reinterpret_cast(reinterpret_cast(ptr)); // WRONG (IntegerPointer)(void *)ptr; // WRONG IntegerPointer(static_cast(ptr)); // WRONG + + reinterpret_cast(ptr); // OK, clearly expresses intent. + // NOTE: dereferencing this pointer violates + // the strict aliasing rules, invoking + // Undefined Behavior. diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp index a784e49885873..68172212904f8 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp @@ -10,42 +10,42 @@ const double cd = 100; void normal_test() { static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'V' (aka 'void *') [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'V' (aka 'void *'); use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&i)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'int *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'int *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&i)); } void const_pointer_test() { static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *const' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *const' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *const' through 'V' (aka 'void *') [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *const' through 'V' (aka 'void *'); use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&i)); - // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'int *' to 'int *const' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'int *' to 'int *const' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&i)); } void const_test() { static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'double *' to 'const int *' through 'const void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'double *' to 'const int *' through 'const void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'double *' to 'const int *' through 'const V' (aka 'void *const') [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'double *' to 'const int *' through 'const V' (aka 'void *const'); use reinterpret_cast 
instead [bugprone-casting-through-void] static_cast(static_cast(&i)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'int *' to 'const int *' through 'const void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'int *' to 'const int *' through 'const void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&i)); static_cast(static_cast(&cd)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const double *' to 'const int *' through 'const void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const double *' to 'const int *' through 'const void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&cd)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const double *' to 'const int *' through 'const CV' (aka 'const void *const') [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const double *' to 'const int *' through 'const CV' (aka 'const void *const'); use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&ci)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const int *' to 'const int *' through 'const void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const int *' to 'const int *' through 'const void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&ci)); } @@ -53,11 +53,11 @@ void const_test() { void reinterpret_cast_test() { static_cast(reinterpret_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] reinterpret_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] reinterpret_cast(reinterpret_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(reinterpret_cast(&i)); reinterpret_cast(reinterpret_cast(&i)); @@ -66,11 +66,11 @@ void reinterpret_cast_test() { void c_style_cast_test() { static_cast((void *)&d); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] (int *)(void *)&d; - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast((void *)&d); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: 
:[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast((void *)&i); } @@ -82,12 +82,12 @@ using I = int *; void cxx_functional_cast() { A(static_cast(&d)); I(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not cast 'double *' to 'I' (aka 'int *') through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not cast 'double *' to 'I' (aka 'int *') through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] } void bit_cast() { __builtin_bit_cast(int *, static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] } namespace PR87069 { From 519b36925cf2e1a59f76bd509471d2e1830169f0 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 4 Sep 2024 11:49:50 +0200 Subject: [PATCH 063/425] [LLD][COFF][NFC] Store impSym as DefinedImportData in ImportFile. (#107162) --- lld/COFF/InputFiles.cpp | 3 +-- lld/COFF/InputFiles.h | 2 +- lld/COFF/SymbolTable.cpp | 4 ++-- lld/COFF/SymbolTable.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index e1ea4ebeabc9b..50bc62312a6f8 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1059,8 +1059,7 @@ void ImportFile::parse() { // address pointed by the __imp_ symbol. (This allows you to call // DLL functions just like regular non-DLL functions.) if (hdr->getType() == llvm::COFF::IMPORT_CODE) - thunkSym = ctx.symtab.addImportThunk( - name, cast_or_null<DefinedImportData>(impSym), hdr->Machine); + thunkSym = ctx.symtab.addImportThunk(name, impSym, hdr->Machine); } BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index a332ac87b265e..8b3303a8d87f4 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -346,7 +346,7 @@ class ImportFile : public InputFile { static bool classof(const InputFile *f) { return f->kind() == ImportKind; } MachineTypes getMachineType() const override; - Symbol *impSym = nullptr; + DefinedImportData *impSym = nullptr; Symbol *thunkSym = nullptr; std::string dllName; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index a5f155bc05bc9..bb7583bb9a7df 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -771,12 +771,12 @@ Symbol *SymbolTable::addCommon(InputFile *f, StringRef n, uint64_t size, return s; } -Symbol *SymbolTable::addImportData(StringRef n, ImportFile *f) { +DefinedImportData *SymbolTable::addImportData(StringRef n, ImportFile *f) { auto [s, wasInserted] = insert(n, nullptr); s->isUsedInRegularObj = true; if (wasInserted || isa<Undefined>(s) || s->isLazy()) { replaceSymbol<DefinedImportData>(s, n, f); - return s; + return cast<DefinedImportData>(s); } reportDuplicate(s, f); diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index b5f95d2ad7f11..51c6c79ec1446 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -102,7 +102,7 @@ class SymbolTable { Symbol *addCommon(InputFile *f, StringRef n, uint64_t size, const llvm::object::coff_symbol_generic *s = nullptr, CommonChunk *c = nullptr); - Symbol *addImportData(StringRef n, ImportFile *f); + DefinedImportData *addImportData(StringRef n, ImportFile *f); Symbol *addImportThunk(StringRef name, DefinedImportData
*s, uint16_t machine); void addLibcall(StringRef name); From 126d6f27102fca0d69dc50cf29a37442d18304cf Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 4 Sep 2024 11:03:22 +0100 Subject: [PATCH 064/425] [AMDGPU] Improve codegen for GFX10+ DPP reductions and scans (#107108) Use poison for an unused input to the permlanex16 intrinsic, to improve register allocation and avoid an unnecessary v_mov instruction. --- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 18 +- .../global-atomic-fadd.f32-no-rtn.ll | 9 +- .../GlobalISel/global-atomic-fadd.f32-rtn.ll | 27 +- .../atomic_optimizations_global_pointer.ll | 368 ++++---- .../atomic_optimizations_local_pointer.ll | 873 ++++++++---------- .../atomic_optimizations_pixelshader.ll | 24 +- .../AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 5 +- .../AMDGPU/global-atomic-fadd.f32-rtn.ll | 22 +- .../AMDGPU/global_atomics_scan_fadd.ll | 250 ++--- .../AMDGPU/global_atomics_scan_fmax.ll | 156 ++-- .../AMDGPU/global_atomics_scan_fmin.ll | 156 ++-- .../AMDGPU/global_atomics_scan_fsub.ll | 250 ++--- 12 files changed, 877 insertions(+), 1281 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 95afc3fcc8d7d..f408a013d7a37 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -421,9 +421,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Reduce within each pair of rows (i.e. 32 lanes). assert(ST->hasPermLaneX16()); - Value *Permlanex16Call = B.CreateIntrinsic( - V->getType(), Intrinsic::amdgcn_permlanex16, - {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); + Value *Permlanex16Call = + B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16, + {PoisonValue::get(AtomicTy), V, B.getInt32(0), + B.getInt32(0), B.getFalse(), B.getFalse()}); V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call); if (ST->isWave32()) { return V; @@ -432,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, if (ST->hasPermLane64()) { // Reduce across the upper and lower 32 lanes. Value *Permlane64Call = - B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V); + B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V); return buildNonAtomicBinOp(B, Op, V, Permlane64Call); } @@ -481,9 +482,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes // 48..63). assert(ST->hasPermLaneX16()); - Value *PermX = B.CreateIntrinsic( - V->getType(), Intrinsic::amdgcn_permlanex16, - {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); + Value *PermX = + B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16, + {PoisonValue::get(AtomicTy), V, B.getInt32(-1), + B.getInt32(-1), B.getFalse(), B.getFalse()}); Value *UpdateDPPCall = B.CreateCall( UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID), @@ -493,7 +495,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, if (!ST->isWave32()) { // Combine lane 31 into lanes 32..63. 
Value *const Lane31 = B.CreateIntrinsic( - V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)}); + AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)}); Value *UpdateDPPCall = B.CreateCall( UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index fcd8c6fb0fe7c..9c634aba348d2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -285,6 +285,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GFX11-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_32_xm0_xexec = SI_PS_LIVE ; GFX11-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.2 @@ -312,12 +313,12 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY11]], 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_ADD_F32_e64_4]], implicit $exec - ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY11]], implicit $exec + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY12]], implicit $exec ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index a8f9ed2e6fba9..fdce9d9258c88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -269,22 +269,23 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 
280, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec - ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec - ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[COPY11]], 0, implicit $exec ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 15 ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_3]] ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READLANE_B32_]], [[S_MOV_B32_4]], [[V_MOV_B32_dpp5]] ; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31 ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_5]] - ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]] - ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY13]], implicit $exec - ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY14]], implicit $exec + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]] + ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY14]], implicit $exec + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY15]], implicit $exec ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} @@ -298,7 +299,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: bb.4.Flow: ; GFX11-NEXT: successors: %bb.6(0x80000000) ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1 + ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %42, %bb.5, [[DEF]], %bb.1 ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def 
$exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.6 ; GFX11-NEXT: {{ $}} @@ -309,10 +310,10 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX11-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec - ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_5]], 0, [[COPY16]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY16]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_5]], 0, [[COPY17]], [[V_CMP_EQ_U32_e64_]], implicit $exec ; GFX11-NEXT: S_BRANCH %bb.4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.6 (%ir-block.38): diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index ed036a83b6143..9397aee37524f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1257,8 +1257,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -1320,8 +1319,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -1379,15 +1377,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1453,25 +1450,23 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -1518,15 +1513,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; 
GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1596,22 +1590,21 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -3150,23 +3143,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -3265,23 +3256,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 @@ -3348,7 +3337,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -3363,27 +3351,25 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -3465,13 +3451,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1132_DPP-NEXT: 
v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3482,31 +3467,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -3563,13 +3547,12 @@ define amdgpu_kernel void @add_i64_varying(ptr 
addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3580,29 +3563,27 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: 
v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3690,7 +3671,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -3703,23 +3684,22 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 @@ -5024,8 +5004,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -5087,8 +5066,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -5146,15 +5124,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5220,25 +5197,23 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -5285,15 +5260,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5363,22 +5337,21 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -6959,23 +6932,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -7074,23 +7045,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: 
v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 @@ -7157,7 +7126,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -7172,27 +7140,25 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -7274,13 +7240,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7291,31 +7256,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: 
v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -7372,13 +7336,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7389,29 +7352,27 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; 
GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7499,7 +7460,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -7512,23 +7473,22 @@ define amdgpu_kernel void 
@sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index c7296185422ce..6d0e0cc7869b3 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -880,8 +880,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -936,8 +935,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; 
GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -986,21 +984,20 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -1053,15 +1050,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -1394,8 +1390,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1428,8 +1423,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 @@ -1460,16 +1454,14 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1502,16 +1494,14 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -2623,23 +2613,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -2732,20 +2720,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 
v7, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 @@ -2807,7 +2793,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -2822,27 +2807,25 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, 
v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -2919,13 +2902,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -2936,28 +2918,28 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; 
GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 @@ -3434,10 +3416,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v3, v1 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, 0, 0 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v4, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -3500,10 +3480,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, 0, 0 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -3560,11 +3538,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v3, 0, 0 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v3, v2 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc @@ -3630,12 +3605,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v3, 0, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 ; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo @@ -4528,8 +4500,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -4584,8 +4555,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -4634,21 +4604,20 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: 
v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -4701,15 +4670,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -5042,8 +5010,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5076,8 +5043,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 @@ -5108,16 +5074,14 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -5150,16 +5114,14 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -6297,23 +6259,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -6406,20 +6366,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 @@ -6481,7 +6439,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -6496,27 +6453,25 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -6593,13 +6548,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -6610,28 +6564,28 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 @@ -7093,8 +7047,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -7149,8 +7102,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, 
v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -7199,21 +7151,20 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -7266,20 +7217,18 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; 
implicit-def: $vgpr0 @@ -7811,10 +7760,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -7894,10 +7841,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 @@ -7965,11 +7910,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -8061,16 +8003,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -8535,8 +8475,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -8591,8 +8530,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -8641,21 +8579,20 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -8708,15 +8645,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -9252,10 +9188,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -9335,10 +9269,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 @@ -9406,11 +9338,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -9502,16 +9431,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -9976,8 +9903,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -10032,8 +9958,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; 
GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -10082,21 +10007,20 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -10149,15 +10073,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -10693,10 +10616,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; 
GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -10776,10 +10697,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 @@ -10847,11 +10766,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -10943,16 +10859,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -11421,8 +11335,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -11477,8 +11390,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -11527,21 +11439,20 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -11594,20 +11505,18 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -12521,24 +12430,22 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 
v7, s4 @@ -12639,22 +12546,20 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 @@ -12738,35 +12643,32 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -12871,25 +12773,23 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: 
v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 @@ -13361,8 +13261,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -13417,8 +13316,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -13467,21 +13365,20 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -13534,20 +13431,18 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -14457,24 +14352,22 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 @@ -14574,22 +14467,20 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -14672,35 +14563,32 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -14805,25 +14693,23 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 @@ -15290,8 +15176,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -15346,8 +15231,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -15396,21 +15280,20 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -15463,15 +15346,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -16375,24 +16257,22 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 @@ -16491,22 +16371,20 @@ 
define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 @@ -16588,35 +16466,32 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: 
v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -16719,25 +16594,23 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 @@ -17209,8 +17082,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -17265,8 +17137,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -17315,21 +17186,20 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -17382,20 +17252,18 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -18295,24 +18163,22 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 
v7, s4 @@ -18411,22 +18277,20 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 @@ -18508,35 +18372,32 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -18639,25 +18500,23 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index f67fcd6e0caf5..22eb8d05b5ff2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -396,8 +396,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064-NEXT: v_readlane_b32 s12, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s12 @@ -460,8 +459,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032-NEXT: v_readlane_b32 s11, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -518,21 +516,19 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s12, v1, 31 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_readlane_b32 s12, v1, 31 ; GFX1164-NEXT: v_mov_b32_e32 v2, s12 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s12, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s13, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_writelane_b32 v3, s12, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[10:11] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX1164-NEXT: v_readlane_b32 s12, v1, 63 @@ -594,20 +590,18 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s11, v1, 31 ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_readlane_b32 s10, v1, 15 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_writelane_b32 v3, s10, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index ad3f920eadc91..6bd0b11acc3ea 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -186,8 +186,9 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX11_GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11_GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY5]], 0, implicit $exec ; GFX11_GFX12-NEXT: 
[[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec ; GFX11_GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 3951e02d46a8f..6766c0c1fdaeb 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -229,7 +229,9 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]] + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[COPY5]], 0, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec @@ -241,8 +243,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_4]] ; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.2 ; GFX11-NEXT: {{ $}} @@ -250,8 +252,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: successors: %bb.4(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %2 - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY6]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; 
GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY %2 + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY7]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; GFX11-NEXT: S_BRANCH %bb.4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.3.Flow: @@ -264,13 +266,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: bb.4 (%ir-block.32): ; GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX11-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec - ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY5]] - ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY7]], implicit $exec + ; GFX11-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY6]] + ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY8]], implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.5 (%ir-block.38): diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 44cd2c6e3af67..584b280cefb8a 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -917,8 +917,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -999,8 +998,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1070,16 +1068,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; 
GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1139,15 +1135,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2155,8 +2150,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -2237,8 +2231,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2308,16 +2301,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2377,15 +2368,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -3453,8 +3443,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -3535,8 +3524,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3606,16 +3594,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3675,15 +3661,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -4247,8 +4232,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -4329,8 +4313,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4400,16 +4383,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -4469,15 +4450,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -5570,8 +5550,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -5652,8 +5631,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5723,16 +5701,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, 
v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -5805,16 +5781,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -7630,10 +7605,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 @@ -7760,10 +7733,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7881,11 +7852,8 @@ 
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 @@ -8007,20 +7975,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -9212,10 +9178,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -9308,10 +9272,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 
row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9395,11 +9357,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -9489,20 +9448,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -10671,10 +10628,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: 
v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -10767,10 +10722,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -10854,11 +10807,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -10948,20 +10898,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: 
s_cbranch_execz .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -11612,10 +11560,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -11708,10 +11654,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11795,11 +11739,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -11889,20 +11830,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, 
s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -13750,10 +13689,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 @@ -13880,10 +13817,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14001,11 +13936,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 @@ -14127,20 +14059,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; 
GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index f0196fadc4b3f..464ec088dc297 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -833,8 +833,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -911,8 +910,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -978,19 +976,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: 
s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1054,17 +1050,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -1898,8 +1892,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -1976,8 +1969,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -2043,19 +2035,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2119,17 +2109,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2963,8 +2951,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -3041,8 +3028,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -3108,19 +3094,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: 
v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3184,17 +3168,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -5002,10 +4984,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 @@ -5142,10 +5122,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; 
GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -5273,11 +5251,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] @@ -5411,21 +5386,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -6452,10 +6426,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 @@ -6543,10 +6515,8 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -6625,11 +6595,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] @@ -6732,21 +6699,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -8569,10 +8535,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 @@ -8709,10 +8673,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -8840,11 +8802,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] @@ -8978,21 +8937,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 
v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index f672c9c6afa22..26a0e34d18bdb 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -833,8 +833,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -911,8 +910,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -978,19 +976,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1054,17 +1050,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -1898,8 +1892,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -1976,8 +1969,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -2043,19 +2035,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2119,17 +2109,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2963,8 +2951,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -3041,8 +3028,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -3108,19 +3094,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instid1(SALU_CYCLE_1) @@ -3184,17 +3168,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -5002,10 +4984,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 @@ -5142,10 +5122,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -5273,11 +5251,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] @@ -5411,21 +5386,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -6452,10 +6426,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 @@ -6543,10 +6515,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -6625,11 +6595,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] @@ -6732,21 +6699,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -8569,10 +8535,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 @@ -8709,10 +8673,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, 
-1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -8840,11 +8802,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] @@ -8978,21 +8937,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 2165a6ff65e3b..c158a8007bcc5 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -995,8 +995,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, 
v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -1077,8 +1076,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1148,16 +1146,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1230,16 +1226,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2345,8 +2340,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, 
v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -2427,8 +2421,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2498,16 +2491,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2580,16 +2571,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -3695,8 +3685,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, 
v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -3777,8 +3766,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3848,16 +3836,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3930,16 +3916,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -4541,8 +4526,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 
32 @@ -4623,8 +4607,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4694,16 +4677,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -4776,16 +4757,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -5890,8 +5870,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -5972,8 +5951,7 @@ define amdgpu_kernel 
void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6043,16 +6021,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -6125,16 +6101,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -7950,10 +7925,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: 
v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 @@ -8080,10 +8053,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8201,11 +8172,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 @@ -8327,20 +8295,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -9531,10 +9497,8 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -9627,10 +9591,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9714,11 +9676,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -9808,20 +9767,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; 
GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -10990,10 +10947,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -11086,10 +11041,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11173,11 +11126,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -11267,20 +11217,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -11931,10 +11879,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -12027,10 +11973,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -12114,11 +12058,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -12208,20 +12149,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; 
GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -14068,10 +14007,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 @@ -14198,10 +14135,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14319,11 +14254,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 @@ -14445,20 +14377,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: From 2fef449f30e2f484897cb199e3338a1520803c7d Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 4 Sep 2024 11:07:11 +0100 Subject: [PATCH 065/425] [LLVM][AArch64] Enable verifyTargetSDNode for scalable vectors and fix the fallout. (#104820) Fix incorrect use of AArch64ISD::UZP1/UUNPK{HI,LO} in: AArch64TargetLowering::LowerDIV AArch64TargetLowering::LowerINSERT_SUBVECTOR The latter highlighted DAG combines that relied on broken behaviour, which this patch also fixes. --- .../Target/AArch64/AArch64ISelLowering.cpp | 87 ++++++++++++++----- 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1735ff5cd6974..5e3f9364ac3e1 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14908,10 +14908,11 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, // NOP cast operands to the largest legal vector of the same element count. if (VT.isFloatingPoint()) { Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG); - Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG); + Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG); } else { // Legal integer vectors are already their largest so Vec0 is fine as is. 
Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); + Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1); } // To replace the top/bottom half of vector V with vector SubV we widen the @@ -14920,11 +14921,13 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SDValue Narrow; if (Idx == 0) { SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); + HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0); Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0); } else { assert(Idx == InVT.getVectorMinNumElements() && "Invalid subvector index!"); SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); + LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0); Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1); } @@ -15024,7 +15027,9 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1)); SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo); SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi); - return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi); + SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultLo); + SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultHi); + return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLoCast, ResultHiCast); } bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles( @@ -22739,7 +22744,19 @@ static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, SDValue Rshrnb = DAG.getNode( AArch64ISD::RSHRNB_I, DL, ResVT, {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)}); - return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb); + return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb); +} + +static SDValue isNVCastToHalfWidthElements(SDValue V) { + if (V.getOpcode() != AArch64ISD::NVCAST) + return SDValue(); + + SDValue Op = V.getOperand(0); + if (V.getValueType().getVectorElementCount() != + Op.getValueType().getVectorElementCount() * 2) + return SDValue(); + + return Op; } static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, @@ -22802,25 +22819,37 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG)) return Urshr; - if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget)) - return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1); + if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) { + if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) { + Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1); + } + } - if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget)) - return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb); + if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) { + if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) { + Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb); + } + } - // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) - if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { - if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { - SDValue X = Op0.getOperand(0).getOperand(0); - return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1); + // uzp1(nvcast(unpklo(uzp1(x, y))), z) => uzp1(x, z) + if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) { + if 
(PreCast.getOpcode() == AArch64ISD::UUNPKLO) { + if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) { + SDValue X = PreCast.getOperand(0).getOperand(0); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1); + } } } - // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z) - if (Op1.getOpcode() == AArch64ISD::UUNPKHI) { - if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) { - SDValue Z = Op1.getOperand(0).getOperand(1); - return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z); + // uzp1(x, nvcast(unpkhi(uzp1(y, z)))) => uzp1(x, z) + if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) { + if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) { + if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) { + SDValue Z = PreCast.getOperand(0).getOperand(1); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z); + } } } @@ -29415,9 +29444,6 @@ void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const { VT.isInteger() && "Expected integer vectors!"); assert(OpVT.getSizeInBits() == VT.getSizeInBits() && "Expected vectors of equal size!"); - // TODO: Enable assert once bogus creations have been fixed. - if (VT.isScalableVector()) - break; assert(OpVT.getVectorElementCount() == VT.getVectorElementCount() * 2 && "Expected result vector with half the lanes of its input!"); break; @@ -29435,12 +29461,27 @@ void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const { EVT Op1VT = N->getOperand(1).getValueType(); assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() && "Expected vectors!"); - // TODO: Enable assert once bogus creations have been fixed. - if (VT.isScalableVector()) - break; assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!"); break; } + case AArch64ISD::RSHRNB_I: { + assert(N->getNumValues() == 1 && "Expected one result!"); + assert(N->getNumOperands() == 2 && "Expected two operands!"); + EVT VT = N->getValueType(0); + EVT Op0VT = N->getOperand(0).getValueType(); + EVT Op1VT = N->getOperand(1).getValueType(); + assert(VT.isVector() && VT.isInteger() && + "Expected integer vector result type!"); + assert(Op0VT.isVector() && Op0VT.isInteger() && + "Expected first operand to be an integer vector!"); + assert(VT.getSizeInBits() == Op0VT.getSizeInBits() && + "Expected vectors of equal size!"); + assert(VT.getVectorElementCount() == Op0VT.getVectorElementCount() * 2 && + "Expected input vector with half the lanes of its result!"); + assert(Op1VT == MVT::i32 && isa(N->getOperand(1)) && + "Expected second operand to be a constant i32!"); + break; + } } } #endif From 29c076b8598c9627cea493fdfc1a30c83385e820 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 12:11:08 +0200 Subject: [PATCH 066/425] [Lint] Fix crash with scalable alloca --- llvm/lib/Analysis/Lint.cpp | 4 ++-- llvm/test/Analysis/Lint/scalable.ll | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Analysis/Lint/scalable.ll diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index a44d5a3bbe462..29b5d38fc93b7 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -436,8 +436,8 @@ void Lint::visitMemoryReference(Instruction &I, const MemoryLocation &Loc, if (AllocaInst *AI = dyn_cast(Base)) { Type *ATy = AI->getAllocatedType(); - if (!AI->isArrayAllocation() && ATy->isSized()) - BaseSize = DL->getTypeAllocSize(ATy); + if (!AI->isArrayAllocation() && ATy->isSized() && !ATy->isScalableTy()) + BaseSize = DL->getTypeAllocSize(ATy).getFixedValue(); BaseAlign = AI->getAlign(); } else if 
(GlobalVariable *GV = dyn_cast(Base)) { // If the global may be defined differently in another compilation unit diff --git a/llvm/test/Analysis/Lint/scalable.ll b/llvm/test/Analysis/Lint/scalable.ll new file mode 100644 index 0000000000000..73100eca8835c --- /dev/null +++ b/llvm/test/Analysis/Lint/scalable.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=lint < %s | FileCheck %s + +; Make sure we don't crash. + +define @test() { +; CHECK-LABEL: define @test() { +; CHECK-NEXT: [[A:%.*]] = alloca , align 8 +; CHECK-NEXT: [[V:%.*]] = load , ptr [[A]], align 8 +; CHECK-NEXT: ret [[V]] +; + %a = alloca + %v = load , ptr %a + ret %v +} From 4f3f09e787bab3caccd9496d93e6453c71d7869f Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Wed, 4 Sep 2024 11:13:10 +0100 Subject: [PATCH 067/425] [flang][debug] Add stride information for assumed shape array. (#106703) Without this information, debugger could present wrong values for arrays in certain cases as shown in issue #105646. Fixes #105646. --- .../Optimizer/Transforms/DebugTypeGenerator.cpp | 16 ++++++++++++---- .../Integration/debug-assumed-shape-array.f90 | 4 ++-- .../Transforms/debug-assumed-shape-array.fir | 4 ++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index 029d3776bcc0b..7c4382079fd6d 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -116,8 +116,8 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( unsigned offset = dimsOffset; const unsigned indexSize = dimsSize / 3; for ([[maybe_unused]] auto _ : seqTy.getShape()) { - // For each dimension, find the offset of count and lower bound in the - // descriptor and generate the dwarf expression to extract it. + // For each dimension, find the offset of count, lower bound and stride in + // the descriptor and generate the dwarf expression to extract it. // FIXME: If `indexSize` happens to be bigger than address size on the // system then we may have to change 'DW_OP_deref' here. addOp(llvm::dwarf::DW_OP_push_object_address, {}); @@ -139,10 +139,18 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( mlir::LLVM::DIExpressionAttr::get(context, ops); ops.clear(); + addOp(llvm::dwarf::DW_OP_push_object_address, {}); + addOp(llvm::dwarf::DW_OP_plus_uconst, + {offset + (indexSize * kDimStridePos)}); + addOp(llvm::dwarf::DW_OP_deref, {}); + // stride[i] = *(base_addr + offset + (indexSize * kDimStridePos)) + mlir::LLVM::DIExpressionAttr strideAttr = + mlir::LLVM::DIExpressionAttr::get(context, ops); + ops.clear(); + offset += dimsSize; mlir::LLVM::DISubrangeAttr subrangeTy = mlir::LLVM::DISubrangeAttr::get( - context, countAttr, lowerAttr, /*upperBound=*/nullptr, - /*stride=*/nullptr); + context, countAttr, lowerAttr, /*upperBound=*/nullptr, strideAttr); elements.push_back(subrangeTy); } return mlir::LLVM::DICompositeTypeAttr::get( diff --git a/flang/test/Integration/debug-assumed-shape-array.f90 b/flang/test/Integration/debug-assumed-shape-array.f90 index a23ffa13cf0ac..9a439e20d1981 100644 --- a/flang/test/Integration/debug-assumed-shape-array.f90 +++ b/flang/test/Integration/debug-assumed-shape-array.f90 @@ -8,6 +8,6 @@ end subroutine ff ! 
CHECK-DAG: !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS:[0-9]+]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref)) ! CHECK-DAG: ![[ELEMS]] = !{![[ELEM1:[0-9]+]], ![[ELEM2:[0-9]+]]} -! CHECK-DAG: ![[ELEM1]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 24, DW_OP_deref)) -! CHECK-DAG: ![[ELEM2]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 56, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 48, DW_OP_deref)) +! CHECK-DAG: ![[ELEM1]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 24, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 40, DW_OP_deref)) +! CHECK-DAG: ![[ELEM2]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 56, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 48, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 64, DW_OP_deref)) diff --git a/flang/test/Transforms/debug-assumed-shape-array.fir b/flang/test/Transforms/debug-assumed-shape-array.fir index 0a9b84ad253aa..d1e64297acea7 100644 --- a/flang/test/Transforms/debug-assumed-shape-array.fir +++ b/flang/test/Transforms/debug-assumed-shape-array.fir @@ -11,6 +11,6 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : #loc2 = loc("test1.f90":3:16) // CHECK: #llvm.di_composite_type, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(24), DW_OP_deref]>>, -// CHECK-SAME: #llvm.di_subrange, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(48), DW_OP_deref]>> +// CHECK-SAME: elements = #llvm.di_subrange, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(24), DW_OP_deref]>, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(40), DW_OP_deref]>>, +// CHECK-SAME: #llvm.di_subrange, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(48), DW_OP_deref]>, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(64), DW_OP_deref]>> // CHECK-SAME: dataLocation = <[DW_OP_push_object_address, DW_OP_deref]>> From d77ccae4a629ba11b5c28f97222a8834c5e5c183 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 4 Sep 2024 10:22:58 +0000 Subject: [PATCH 068/425] [lldb] Fix 32 bit compile error https://lab.llvm.org/buildbot/#/builders/18/builds/3247/steps/4/logs/stdio In code added by https://github.com/llvm/llvm-project/issues/87471. --- lldb/source/Target/Process.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 6c5c5162e2468..97ce2c14458e9 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -3377,7 +3377,8 @@ lldb::addr_t Process::FindInMemory(lldb::addr_t low, lldb::addr_t high, mem.resize_for_overwrite(max_read_size); Status error; mem.resize(ReadMemory(cur_addr, mem.data(), - std::min(mem.size(), high - cur_addr), error)); + std::min(mem.size(), high - cur_addr), + error)); mem_pos = cur_addr; if (size > mem.size()) { // We didn't read enough data. Skip to the next memory region. 
From 4f130fa943af8bf47f4401deff0d825a91dc7584 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Wed, 4 Sep 2024 11:28:47 +0100 Subject: [PATCH 069/425] [flang][Driver] support -fno-openmp (#107087) Closes #83148 --- clang/include/clang/Driver/Options.td | 1 + flang/lib/Frontend/CompilerInvocation.cpp | 4 +++- flang/test/Driver/fno-openmp.f90 | 12 ++++++++++++ flang/test/Driver/fopenmp.f90 | 9 +++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 flang/test/Driver/fno-openmp.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 83cf753e82484..8fe9f4f28f8fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3545,6 +3545,7 @@ def fopenmp : Flag<["-"], "fopenmp">, Group, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, HelpText<"Parse OpenMP pragmas and generate parallel code.">; def fno_openmp : Flag<["-"], "fno-openmp">, Group, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, Flags<[NoArgumentUnused]>; class OpenMPVersionHelp { string str = !strconcat( diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 1d73397d33017..9e42fcc2e39d5 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -968,7 +968,9 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, /// generated. static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - if (!args.hasArg(clang::driver::options::OPT_fopenmp)) + llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp, + clang::driver::options::OPT_fno_openmp); + if (!arg || arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) return true; unsigned numErrorsBefore = diags.getNumErrors(); diff --git a/flang/test/Driver/fno-openmp.f90 b/flang/test/Driver/fno-openmp.f90 new file mode 100644 index 0000000000000..98c8793c8c9bc --- /dev/null +++ b/flang/test/Driver/fno-openmp.f90 @@ -0,0 +1,12 @@ +! RUN: %flang_fc1 -fopenmp -fno-openmp %s -emit-hlfir -o - | FileCheck --check-prefix=CHECK-NO-OMP %s +! RUN: %flang_fc1 -fno-openmp %s -emit-hlfir -o - | FileCheck --check-prefix=CHECK-NO-OMP %s +! RUN: %flang_fc1 -fno-openmp -fopenmp %s -emit-hlfir -o - | FileCheck --check-prefix=CHECK-OMP %s +! RUN: %flang_fc1 -fopenmp %s -emit-hlfir -o - | FileCheck --check-prefix=CHECK-OMP %s + +subroutine main + ! CHECK-NO-OMP-NOT: omp.parallel + ! CHECK-OMP: omp.parallel + !$omp parallel + print *,"test" + !$omp end parallel +end subroutine diff --git a/flang/test/Driver/fopenmp.f90 b/flang/test/Driver/fopenmp.f90 index d70fe100c3d21..9b4dc5ffb1f69 100644 --- a/flang/test/Driver/fopenmp.f90 +++ b/flang/test/Driver/fopenmp.f90 @@ -11,6 +11,9 @@ ! RUN: %flang -target x86_64-windows-gnu -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FC1-NO-OPENMP --check-prefix=CHECK-WARNING ! RUN: %flang -target x86_64-windows-gnu -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FC1-OPENMP +!RUN: %flang -fno-openmp -fopenmp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FC1-OPENMP +!RUN: %flang -fopenmp -fno-openmp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FC1-NO-OPENMP + ! CHECK-FC1-OPENMP: "-fc1" ! CHECK-FC1-OPENMP: "-fopenmp" ! @@ -59,8 +62,14 @@ ! RUN: %flang -target x86_64-freebsd -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY ! 
RUN: %flang -target x86_64-windows-gnu -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANYMD ! +! RUN: %flang -target x86_64-linux-gnu -fno-openmp -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY +! RUN: %flang -target x86_64-linux-gnu -fopenmp -fno-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-LD-ANY +! ! CHECK-LD-ANY: "{{.*}}ld{{(.exe)?}}" ! CHECK-LD-ANY: "-l{{(omp|gomp|iomp5)}}" ! +! CHECK-NO-LD-ANY: "{{.*}}ld{{(.exe)?}}" +! CHECK-NO-LD-ANY-NOT: "-l{{(omp|gomp|iomp5)}}" +! ! CHECK-LD-ANYMD: "{{.*}}ld{{(.exe)?}}" ! CHECK-LD-ANYMD: "-l{{(omp|gomp|iomp5md)}}" From 5818337765e4c74918a700a14df5f64a658c47ee Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 4 Sep 2024 11:47:37 +0100 Subject: [PATCH 070/425] LICM: hoist BO assoc when (C1 op LV) op C2 (#106999) Extend hoistBOAssociation to handle the "(C1 op LV) op C2" case, when op is commutative. --- llvm/lib/Transforms/Scalar/LICM.cpp | 17 ++++---- llvm/test/Transforms/LICM/hoist-binop.ll | 49 ++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 1aad182bb7433..5cf7c252bb5f3 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2803,15 +2803,15 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L, /// Reassociate associative binary expressions of the form /// -/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)" +/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)" if op is an associative BinOp +/// 2. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is a commutative BinOp /// -/// where op is an associative binary op, LV is a loop variant, and C1 and C2 -/// are loop invariants that we want to hoist. +/// where LV is a loop variant, and C1 and C2 are loop invariants that we want +/// to hoist. /// /// TODO: This can be extended to more cases such as -/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV" -/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative -/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative +/// 1. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV" if op is an associative BinOp +/// 2. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is a commutative BinOp static bool hoistBOAssociation(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, AssumptionCache *AC, @@ -2830,11 +2830,14 @@ static bool hoistBOAssociation(Instruction &I, Loop &L, BO0->hasNUsesOrMore(3)) return false; - // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)" Value *LV = BO0->getOperand(0); Value *C1 = BO0->getOperand(1); Value *C2 = BO->getOperand(1); + if (L.isLoopInvariant(LV) && !L.isLoopInvariant(C1)) { + assert(BO0->isCommutative() && "Associativity implies commutativity"); + std::swap(LV, C1); + } if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) || !L.isLoopInvariant(C2)) return false; diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll index 53c1df61931d7..b0ee45a5fb350 100644 --- a/llvm/test/Transforms/LICM/hoist-binop.ll +++ b/llvm/test/Transforms/LICM/hoist-binop.ll @@ -67,6 +67,31 @@ loop: br label %loop } + +; Hoist ADD and copy NUW if both ops have it. Commutative version.
+define void @add_nuw_comm(i64 %c1, i64 %c2) { +; CHECK-LABEL: @add_nuw_comm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = add nuw i64 %c1, %index + call void @use(i64 %step.add) + %index.next = add nuw i64 %step.add, %c2 + br label %loop +} + ; Hoist MUL and drop NUW even if both ops have it. define void @mul_nuw(i64 %c1, i64 %c2) { ; CHECK-LABEL: @mul_nuw( @@ -91,6 +116,30 @@ loop: br label %loop } +; Hoist MUL and drop NUW even if both ops have it. Commutative version. +define void @mul_nuw_comm(i64 %c1, i64 %c2) { +; CHECK-LABEL: @mul_nuw_comm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = mul nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = mul nuw i64 %c1, %index + call void @use(i64 %step.add) + %index.next = mul nuw i64 %step.add, %c2 + br label %loop +} + ; Hoist ADD but don't copy NUW if only one op has it. 
define void @add_no_nuw(i64 %c1, i64 %c2) { ; CHECK-LABEL: @add_no_nuw( From 5a6926ce49b6df807bff6083325ca291b0e731e5 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 4 Sep 2024 11:48:02 +0100 Subject: [PATCH 071/425] [AMDGPU] Fix test update after #107108 --- .../CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 9397aee37524f..cc7050d08541a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1604,7 +1604,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -5351,7 +5351,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 From 360f82f3703fa57de42a2f998b172551f294e11a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 12:47:35 +0200 Subject: [PATCH 072/425] [Lint] Fix crash for insert/extract on scalable vector Don't assume the vector is fixed size. For scalable vectors, do not report an error, as indices outside the minimum range may be valid. 
--- llvm/lib/Analysis/Lint.cpp | 17 +++++++++-------- llvm/test/Analysis/Lint/scalable.ll | 25 +++++++++++++++---------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 29b5d38fc93b7..415b16d25efd2 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -590,19 +590,20 @@ void Lint::visitIndirectBrInst(IndirectBrInst &I) { void Lint::visitExtractElementInst(ExtractElementInst &I) { if (ConstantInt *CI = dyn_cast(findValue(I.getIndexOperand(), - /*OffsetOk=*/false))) - Check( - CI->getValue().ult( - cast(I.getVectorOperandType())->getNumElements()), - "Undefined result: extractelement index out of range", &I); + /*OffsetOk=*/false))) { + ElementCount EC = I.getVectorOperandType()->getElementCount(); + Check(EC.isScalable() || CI->getValue().ult(EC.getFixedValue()), + "Undefined result: extractelement index out of range", &I); + } } void Lint::visitInsertElementInst(InsertElementInst &I) { if (ConstantInt *CI = dyn_cast(findValue(I.getOperand(2), - /*OffsetOk=*/false))) - Check(CI->getValue().ult( - cast(I.getType())->getNumElements()), + /*OffsetOk=*/false))) { + ElementCount EC = I.getType()->getElementCount(); + Check(EC.isScalable() || CI->getValue().ult(EC.getFixedValue()), "Undefined result: insertelement index out of range", &I); + } } void Lint::visitUnreachableInst(UnreachableInst &I) { diff --git a/llvm/test/Analysis/Lint/scalable.ll b/llvm/test/Analysis/Lint/scalable.ll index 73100eca8835c..4bcc4dae8d837 100644 --- a/llvm/test/Analysis/Lint/scalable.ll +++ b/llvm/test/Analysis/Lint/scalable.ll @@ -1,15 +1,20 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes=lint < %s | FileCheck %s +; RUN: opt -S -passes=lint -disable-output < %s 2>&1 | FileCheck %s --allow-empty -; Make sure we don't crash. 
- -define @test() { -; CHECK-LABEL: define @test() { -; CHECK-NEXT: [[A:%.*]] = alloca , align 8 -; CHECK-NEXT: [[V:%.*]] = load , ptr [[A]], align 8 -; CHECK-NEXT: ret [[V]] -; +; CHECK-NOT: Buffer overflow +define @alloca_access() { %a = alloca %v = load , ptr %a ret %v } + +; CHECK-NOT: insertelement index out of range +define @insertelement() { + %insert = insertelement poison, half 0xH0000, i64 100 + ret %insert +} + +; CHECK-NOT: extract index out of range +define half @extractelement( %v) { + %insert = extractelement %v, i64 100 + ret half %insert +} From 6d3563422ce6f431b837221932d32db4c9681fc5 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 4 Sep 2024 11:58:07 +0100 Subject: [PATCH 073/425] include REQUIRES guard to aarch64-neon-luti.c (#107217) --- clang/test/CodeGen/aarch64-neon-luti.c | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/test/CodeGen/aarch64-neon-luti.c b/clang/test/CodeGen/aarch64-neon-luti.c index 72cb6bcdb40f0..40daf742eb966 100644 --- a/clang/test/CodeGen/aarch64-neon-luti.c +++ b/clang/test/CodeGen/aarch64-neon-luti.c @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: aarch64-registered-target #include // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -O3 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s From cd46829e547d2d0aa3cb0ef7c9de59c507eaaecc Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Wed, 4 Sep 2024 16:28:39 +0530 Subject: [PATCH 074/425] [LV] Fix emission of debug message in legality check (#101924) Successful vectorization message is emitted even after "Result" is false. "Result" = false indicates failure of one of the legality check and thus successful message should not be printed. --- .../Vectorize/LoopVectorizationLegality.cpp | 20 ++++++++---- .../LoopVectorize/check-no-vectorize.ll | 32 +++++++++++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/check-no-vectorize.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 66a779da8c25b..7042af6dd8eae 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1451,10 +1451,12 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { // Check whether the loop-related control flow in the loop nest is expected by // vectorizer. if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) { - if (DoExtraAnalysis) + if (DoExtraAnalysis) { + LLVM_DEBUG(dbgs() << "LV: legality check failed: loop nest"); Result = false; - else + } else { return false; + } } // We need to have a loop header. @@ -1519,17 +1521,21 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return false; } - LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop" - << (LAI->getRuntimePointerChecking()->Need - ? " (with a runtime bound check)" - : "") - << "!\n"); + if (Result) { + LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop" + << (LAI->getRuntimePointerChecking()->Need + ? 
" (with a runtime bound check)" + : "") + << "!\n"); + } unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; if (PSE.getPredicate().getComplexity() > SCEVThreshold) { + LLVM_DEBUG(dbgs() << "LV: Vectorization not profitable " + "due to SCEVThreshold"); reportVectorizationFailure("Too many SCEV checks needed", "Too many SCEV assumptions need to be made and checked at runtime", "TooManySCEVRunTimeChecks", ORE, TheLoop); diff --git a/llvm/test/Transforms/LoopVectorize/check-no-vectorize.ll b/llvm/test/Transforms/LoopVectorize/check-no-vectorize.ll new file mode 100644 index 0000000000000..e45a204931319 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/check-no-vectorize.ll @@ -0,0 +1,32 @@ +; This test checks that we don't emit both +; successful and unsuccessful message about vectorization. + +; REQUIRES: asserts +; RUN: opt -passes=loop-vectorize -debug -disable-output < %s 2>&1 | FileCheck %s +; CHECK-NOT: LV: We can vectorize this loop +; CHECK: LV: Not vectorizing: Cannot prove legality +; CHECK-NOT: LV: We can vectorize this loop + +@a = global [32000 x i32] zeroinitializer, align 4 + +define void @foo(i32 %val1, i32 %val2) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i32 [ %val1, %entry ], [ %add1, %for.body ] + %1 = phi i32 [ %val2, %entry ], [ %2, %for.body ] + %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds [32000 x i32], ptr @a, i64 0, i64 %iv + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx2 = getelementptr inbounds [32000 x i32], ptr @a, i64 0, i64 %iv.next + %2 = load i32, ptr %arrayidx2, align 4 + %add0 = add nsw i32 %2, %1 + %add1 = add nsw i32 %add0, %0 + store i32 %add1, ptr %arrayidx, align 4 + %exitcond = icmp eq i64 %iv.next, 31999 + br i1 %exitcond, label %exit, label %for.body + +exit: ; preds = %for.body + ret void +} From 7afdc6bd57d634354597df185fd7037bec9241ff Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 4 Sep 2024 11:09:36 +0100 Subject: [PATCH 075/425] [DAG] Fix typo in i64/i128 abdu/abds tests I'd incorrectly swapped the operands in some of the "cmp" test patterns when I changed the condition code --- llvm/test/CodeGen/AArch64/abds-neg.ll | 21 +- llvm/test/CodeGen/AArch64/abds.ll | 21 +- llvm/test/CodeGen/AArch64/abdu-neg.ll | 22 +- llvm/test/CodeGen/AArch64/abdu.ll | 22 +- llvm/test/CodeGen/RISCV/abds-neg.ll | 233 ++++++++-------- llvm/test/CodeGen/RISCV/abds.ll | 273 ++++++++++--------- llvm/test/CodeGen/RISCV/abdu-neg.ll | 366 ++++++++++++-------------- llvm/test/CodeGen/RISCV/abdu.ll | 359 +++++++++++++------------ llvm/test/CodeGen/X86/abds-neg.ll | 37 +-- llvm/test/CodeGen/X86/abds.ll | 37 ++- llvm/test/CodeGen/X86/abdu-neg.ll | 22 +- llvm/test/CodeGen/X86/abdu.ll | 89 +++---- 12 files changed, 738 insertions(+), 764 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/abds-neg.ll b/llvm/test/CodeGen/AArch64/abds-neg.ll index d4c6a09405e0c..ac7cb1f619557 100644 --- a/llvm/test/CodeGen/AArch64/abds-neg.ll +++ b/llvm/test/CodeGen/AArch64/abds-neg.ll @@ -377,30 +377,31 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x9, x8, gt +; CHECK-NEXT: csel x0, x9, x8, lt ; CHECK-NEXT: ret %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define 
i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbc x9, x1, x3 -; CHECK-NEXT: subs x10, x2, x0 -; CHECK-NEXT: sbc x11, x3, x1 -; CHECK-NEXT: sbcs xzr, x3, x1 -; CHECK-NEXT: csel x0, x8, x10, lt -; CHECK-NEXT: csel x1, x9, x11, lt +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: sbc x8, x1, x3 +; CHECK-NEXT: subs x9, x2, x0 +; CHECK-NEXT: sbc x10, x3, x1 +; CHECK-NEXT: subs x11, x0, x2 +; CHECK-NEXT: sbcs xzr, x1, x3 +; CHECK-NEXT: csel x0, x11, x9, lt +; CHECK-NEXT: csel x1, x8, x10, lt ; CHECK-NEXT: ret %cmp = icmp slt i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll index 45bb8749b25ed..0e35f8240848b 100644 --- a/llvm/test/CodeGen/AArch64/abds.ll +++ b/llvm/test/CodeGen/AArch64/abds.ll @@ -343,31 +343,30 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x8, x9, ge +; CHECK-NEXT: csel x0, x9, x8, gt ; CHECK-NEXT: ret %cmp = icmp sge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, x2 -; CHECK-NEXT: sbc x8, x1, x3 -; CHECK-NEXT: subs x9, x2, x0 -; CHECK-NEXT: sbc x10, x3, x1 -; CHECK-NEXT: subs x11, x0, x2 -; CHECK-NEXT: sbcs xzr, x1, x3 -; CHECK-NEXT: csel x0, x9, x11, ge -; CHECK-NEXT: csel x1, x10, x8, ge +; CHECK-NEXT: subs x8, x0, x2 +; CHECK-NEXT: sbc x9, x1, x3 +; CHECK-NEXT: subs x10, x2, x0 +; CHECK-NEXT: sbc x11, x3, x1 +; CHECK-NEXT: sbcs xzr, x3, x1 +; CHECK-NEXT: csel x0, x8, x10, lt +; CHECK-NEXT: csel x1, x9, x11, lt ; CHECK-NEXT: ret %cmp = icmp sge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/AArch64/abdu-neg.ll b/llvm/test/CodeGen/AArch64/abdu-neg.ll index b148a29a72976..2118816ca7c58 100644 --- a/llvm/test/CodeGen/AArch64/abdu-neg.ll +++ b/llvm/test/CodeGen/AArch64/abdu-neg.ll @@ -379,31 +379,31 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x9, x8, hi +; CHECK-NEXT: csel x0, x9, x8, lo ; CHECK-NEXT: ret %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbcs x9, x1, x3 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x10 -; CHECK-NEXT: subs x0, x8, x10 -; CHECK-NEXT: sbc x1, x9, x10 +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: sbc x8, x1, x3 +; CHECK-NEXT: subs x9, x2, x0 +; CHECK-NEXT: sbc x10, x3, x1 +; CHECK-NEXT: subs x11, x0, x2 +; CHECK-NEXT: sbcs xzr, x1, x3 +; CHECK-NEXT: csel x0, x11, x9, lo +; CHECK-NEXT: csel x1, x8, x10, lo ; CHECK-NEXT: ret %cmp = icmp ult i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 
%cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll index 22d41dfb85a62..eb866e6a78a9b 100644 --- a/llvm/test/CodeGen/AArch64/abdu.ll +++ b/llvm/test/CodeGen/AArch64/abdu.ll @@ -346,31 +346,31 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x8, x9, hs +; CHECK-NEXT: csel x0, x9, x8, hi ; CHECK-NEXT: ret %cmp = icmp uge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, x2 -; CHECK-NEXT: sbc x8, x1, x3 -; CHECK-NEXT: subs x9, x2, x0 -; CHECK-NEXT: sbc x10, x3, x1 -; CHECK-NEXT: subs x11, x0, x2 -; CHECK-NEXT: sbcs xzr, x1, x3 -; CHECK-NEXT: csel x0, x9, x11, hs -; CHECK-NEXT: csel x1, x10, x8, hs +; CHECK-NEXT: subs x8, x0, x2 +; CHECK-NEXT: sbcs x9, x1, x3 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NEXT: eor x8, x8, x10 +; CHECK-NEXT: eor x9, x9, x10 +; CHECK-NEXT: subs x0, x8, x10 +; CHECK-NEXT: sbc x1, x9, x10 ; CHECK-NEXT: ret %cmp = icmp uge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll index 058f105e8f735..168615983d970 100644 --- a/llvm/test/CodeGen/RISCV/abds-neg.ll +++ b/llvm/test/CodeGen/RISCV/abds-neg.ll @@ -1791,20 +1791,20 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: abd_cmp_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: sltu a4, a2, a0 +; RV32I-NEXT: sltu a4, a0, a2 ; RV32I-NEXT: mv a5, a4 ; RV32I-NEXT: beq a1, a3, .LBB21_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt a5, a3, a1 +; RV32I-NEXT: slt a5, a1, a3 ; RV32I-NEXT: .LBB21_2: ; RV32I-NEXT: bnez a5, .LBB21_4 ; RV32I-NEXT: # %bb.3: +; RV32I-NEXT: sltu a4, a2, a0 ; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB21_4: -; RV32I-NEXT: sltu a4, a0, a2 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a2 @@ -1812,7 +1812,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; ; RV64I-LABEL: abd_cmp_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: blt a1, a0, .LBB21_2 +; RV64I-NEXT: blt a0, a1, .LBB21_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret @@ -1822,20 +1822,20 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; ; RV32ZBB-LABEL: abd_cmp_i64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: sltu a4, a2, a0 +; RV32ZBB-NEXT: sltu a4, a0, a2 ; RV32ZBB-NEXT: mv a5, a4 ; RV32ZBB-NEXT: beq a1, a3, .LBB21_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt a5, a3, a1 +; RV32ZBB-NEXT: slt a5, a1, a3 ; RV32ZBB-NEXT: .LBB21_2: ; RV32ZBB-NEXT: bnez a5, .LBB21_4 ; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: sltu a4, a2, a0 ; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: sub a0, a2, a0 ; RV32ZBB-NEXT: ret ; RV32ZBB-NEXT: .LBB21_4: -; RV32ZBB-NEXT: sltu a4, a0, a2 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: sub a0, a0, a2 @@ -1843,109 +1843,103 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; ; RV64ZBB-LABEL: abd_cmp_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: min a2, a0, a1 
-; RV64ZBB-NEXT: max a0, a0, a1 -; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: blt a0, a1, .LBB21_2 +; RV64ZBB-NEXT: # %bb.1: +; RV64ZBB-NEXT: sub a0, a1, a0 +; RV64ZBB-NEXT: ret +; RV64ZBB-NEXT: .LBB21_2: +; RV64ZBB-NEXT: sub a0, a0, a1 ; RV64ZBB-NEXT: ret %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a3, 0(a2) +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a5, 4(a2) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw a2, 12(a2) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a1, 4(a2) -; RV32I-NEXT: sltu a2, a7, a6 -; RV32I-NEXT: mv t4, a2 -; RV32I-NEXT: beq t0, t1, .LBB22_2 +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: sltu t1, a7, a6 +; RV32I-NEXT: mv t4, t1 +; RV32I-NEXT: beq t0, a2, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, t0 +; RV32I-NEXT: slt t4, t0, a2 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: sltu t2, a5, a3 -; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a4, a3 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a4, a1, .LBB22_4 +; RV32I-NEXT: beq a1, a5, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv t3, t5 +; RV32I-NEXT: sltu t3, a1, a5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: xor t6, t0, t1 -; RV32I-NEXT: xor s0, a6, a7 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: beqz t6, .LBB22_6 +; RV32I-NEXT: xor t5, t0, a2 +; RV32I-NEXT: xor t6, a7, a6 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: mv t6, t3 +; RV32I-NEXT: beqz t5, .LBB22_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t3, t4 +; RV32I-NEXT: mv t6, t4 ; RV32I-NEXT: .LBB22_6: -; RV32I-NEXT: mv t4, t2 -; RV32I-NEXT: beq a1, a4, .LBB22_8 +; RV32I-NEXT: sltu t4, a3, a4 +; RV32I-NEXT: mv t5, t4 +; RV32I-NEXT: beq a1, a5, .LBB22_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv t4, t5 +; RV32I-NEXT: sltu t5, a5, a1 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: sltu t5, a3, a5 -; RV32I-NEXT: mv t6, t5 -; RV32I-NEXT: beq a4, a1, .LBB22_10 +; RV32I-NEXT: bnez t6, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t6, a4, a1 +; RV32I-NEXT: sltu t1, a6, a7 +; RV32I-NEXT: sub a2, a2, t0 +; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sub a6, a6, a7 +; RV32I-NEXT: sltu a7, a6, t5 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: sub a6, a6, t5 +; RV32I-NEXT: sub a5, a5, a1 +; RV32I-NEXT: sub a1, a5, t4 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: j .LBB22_11 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: bnez t3, .LBB22_12 -; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sub t0, t1, t0 -; RV32I-NEXT: sub a6, a7, a6 ; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sltu a7, a6, t4 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sltu a7, a6, t3 +; RV32I-NEXT: sub a1, a1, a5 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: sub a6, a6, t3 ; RV32I-NEXT: sub a1, a1, t2 -; RV32I-NEXT: sub a4, a6, t4 -; RV32I-NEXT: j .LBB22_13 -; RV32I-NEXT: .LBB22_12: -; RV32I-NEXT: sltu a2, a6, a7 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t6 -; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a3, a3, a5 -; RV32I-NEXT: sub a4, a4, a1 -; 
RV32I-NEXT: sub a1, a4, t5 -; RV32I-NEXT: sub a4, a6, t6 -; RV32I-NEXT: .LBB22_13: -; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: .LBB22_11: +; RV32I-NEXT: sw a6, 8(a0) ; RV32I-NEXT: sw a1, 4(a0) ; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a2, 12(a0) -; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: ; RV64I: # %bb.0: -; RV64I-NEXT: sltu a4, a2, a0 +; RV64I-NEXT: sltu a4, a0, a2 ; RV64I-NEXT: mv a5, a4 ; RV64I-NEXT: beq a1, a3, .LBB22_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: slt a5, a3, a1 +; RV64I-NEXT: slt a5, a1, a3 ; RV64I-NEXT: .LBB22_2: ; RV64I-NEXT: bnez a5, .LBB22_4 ; RV64I-NEXT: # %bb.3: +; RV64I-NEXT: sltu a4, a2, a0 ; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: sub a1, a1, a4 ; RV64I-NEXT: sub a0, a2, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB22_4: -; RV64I-NEXT: sltu a4, a0, a2 ; RV64I-NEXT: sub a1, a1, a3 ; RV64I-NEXT: sub a1, a1, a4 ; RV64I-NEXT: sub a0, a0, a2 @@ -1953,95 +1947,86 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 0(a1) -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a4, 4(a1) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a3, 0(a2) +; RV32ZBB-NEXT: lw a4, 0(a1) +; RV32ZBB-NEXT: lw a5, 4(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a7, 8(a1) +; RV32ZBB-NEXT: lw a2, 12(a2) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a1, 4(a2) -; RV32ZBB-NEXT: sltu a2, a7, a6 -; RV32ZBB-NEXT: mv t4, a2 -; RV32ZBB-NEXT: beq t0, t1, .LBB22_2 +; RV32ZBB-NEXT: lw a1, 4(a1) +; RV32ZBB-NEXT: sltu t1, a7, a6 +; RV32ZBB-NEXT: mv t4, t1 +; RV32ZBB-NEXT: beq t0, a2, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, t0 +; RV32ZBB-NEXT: slt t4, t0, a2 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 -; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a4, a3 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a4, a1, .LBB22_4 +; RV32ZBB-NEXT: beq a1, a5, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv t3, t5 +; RV32ZBB-NEXT: sltu t3, a1, a5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: addi sp, sp, -16 -; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: xor t6, t0, t1 -; RV32ZBB-NEXT: xor s0, a6, a7 -; RV32ZBB-NEXT: or t6, s0, t6 -; RV32ZBB-NEXT: beqz t6, .LBB22_6 +; RV32ZBB-NEXT: xor t5, t0, a2 +; RV32ZBB-NEXT: xor t6, a7, a6 +; RV32ZBB-NEXT: or t5, t6, t5 +; RV32ZBB-NEXT: mv t6, t3 +; RV32ZBB-NEXT: beqz t5, .LBB22_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t3, t4 +; RV32ZBB-NEXT: mv t6, t4 ; RV32ZBB-NEXT: .LBB22_6: -; RV32ZBB-NEXT: mv t4, t2 -; RV32ZBB-NEXT: beq a1, a4, .LBB22_8 +; RV32ZBB-NEXT: sltu t4, a3, a4 +; RV32ZBB-NEXT: mv t5, t4 +; RV32ZBB-NEXT: beq a1, a5, .LBB22_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: mv t4, t5 +; RV32ZBB-NEXT: sltu t5, a5, a1 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: sltu t5, a3, a5 -; RV32ZBB-NEXT: mv t6, t5 -; RV32ZBB-NEXT: beq a4, a1, .LBB22_10 +; RV32ZBB-NEXT: bnez t6, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: sltu t1, a6, a7 +; RV32ZBB-NEXT: sub a2, a2, t0 +; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sub a6, a6, a7 +; RV32ZBB-NEXT: sltu a7, a6, t5 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a6, a6, t5 +; RV32ZBB-NEXT: sub a5, a5, a1 +; RV32ZBB-NEXT: sub a1, a5, t4 +; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: j .LBB22_11 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: bnez t3, .LBB22_12 
-; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sub t0, t1, t0 -; RV32ZBB-NEXT: sub a6, a7, a6 ; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sltu a7, a6, t4 +; RV32ZBB-NEXT: sub a6, a7, a6 +; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sltu a7, a6, t3 +; RV32ZBB-NEXT: sub a1, a1, a5 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a3, a5, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a6, a6, t3 ; RV32ZBB-NEXT: sub a1, a1, t2 -; RV32ZBB-NEXT: sub a4, a6, t4 -; RV32ZBB-NEXT: j .LBB22_13 -; RV32ZBB-NEXT: .LBB22_12: -; RV32ZBB-NEXT: sltu a2, a6, a7 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t6 -; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a3, a3, a5 -; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, a4, t5 -; RV32ZBB-NEXT: sub a4, a6, t6 -; RV32ZBB-NEXT: .LBB22_13: -; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: .LBB22_11: +; RV32ZBB-NEXT: sw a6, 8(a0) ; RV32ZBB-NEXT: sw a1, 4(a0) ; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a2, 12(a0) -; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: sltu a4, a2, a0 +; RV64ZBB-NEXT: sltu a4, a0, a2 ; RV64ZBB-NEXT: mv a5, a4 ; RV64ZBB-NEXT: beq a1, a3, .LBB22_2 ; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: slt a5, a3, a1 +; RV64ZBB-NEXT: slt a5, a1, a3 ; RV64ZBB-NEXT: .LBB22_2: ; RV64ZBB-NEXT: bnez a5, .LBB22_4 ; RV64ZBB-NEXT: # %bb.3: +; RV64ZBB-NEXT: sltu a4, a2, a0 ; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: sub a1, a1, a4 ; RV64ZBB-NEXT: sub a0, a2, a0 ; RV64ZBB-NEXT: ret ; RV64ZBB-NEXT: .LBB22_4: -; RV64ZBB-NEXT: sltu a4, a0, a2 ; RV64ZBB-NEXT: sub a1, a1, a3 ; RV64ZBB-NEXT: sub a1, a1, a4 ; RV64ZBB-NEXT: sub a0, a0, a2 @@ -2049,7 +2034,7 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { %cmp = icmp slt i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index b867a55445c95..86b36d8f69e95 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -1448,250 +1448,265 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: abd_cmp_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sltu a4, a2, a0 ; RV32I-NEXT: mv a5, a4 ; RV32I-NEXT: beq a1, a3, .LBB21_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt a5, a1, a3 +; RV32I-NEXT: slt a5, a3, a1 ; RV32I-NEXT: .LBB21_2: -; RV32I-NEXT: beqz a5, .LBB21_4 +; RV32I-NEXT: bnez a5, .LBB21_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB21_4: -; RV32I-NEXT: sltu a4, a2, a0 -; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: bge a0, a1, .LBB21_2 +; RV64I-NEXT: blt a1, a0, .LBB21_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB21_2: -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: 
abd_cmp_i64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sltu a4, a2, a0 ; RV32ZBB-NEXT: mv a5, a4 ; RV32ZBB-NEXT: beq a1, a3, .LBB21_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt a5, a1, a3 +; RV32ZBB-NEXT: slt a5, a3, a1 ; RV32ZBB-NEXT: .LBB21_2: -; RV32ZBB-NEXT: beqz a5, .LBB21_4 +; RV32ZBB-NEXT: bnez a5, .LBB21_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: sub a0, a2, a0 ; RV32ZBB-NEXT: ret ; RV32ZBB-NEXT: .LBB21_4: -; RV32ZBB-NEXT: sltu a4, a2, a0 -; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: sub a0, a0, a2 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: bge a0, a1, .LBB21_2 -; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sub a0, a0, a1 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB21_2: -; RV64ZBB-NEXT: sub a0, a1, a0 +; RV64ZBB-NEXT: min a2, a0, a1 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp sge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a4, 0(a1) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) -; RV32I-NEXT: lw a7, 8(a1) -; RV32I-NEXT: lw a2, 12(a2) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a4, 4(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: lw a7, 8(a2) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a7, a6 -; RV32I-NEXT: mv t4, t1 -; RV32I-NEXT: beq t0, a2, .LBB22_2 +; RV32I-NEXT: lw t1, 12(a2) +; RV32I-NEXT: lw a1, 4(a2) +; RV32I-NEXT: sltu a2, a7, a6 +; RV32I-NEXT: mv t4, a2 +; RV32I-NEXT: beq t0, t1, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t0, a2 +; RV32I-NEXT: slt t4, t1, t0 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: xor t3, t0, a2 -; RV32I-NEXT: xor t5, a7, a6 -; RV32I-NEXT: sltu t2, a4, a3 -; RV32I-NEXT: or t5, t5, t3 +; RV32I-NEXT: sltu t2, a5, a3 +; RV32I-NEXT: sltu t5, a1, a4 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a1, a5, .LBB22_4 +; RV32I-NEXT: beq a4, a1, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t3, a1, a5 +; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: mv t6, t3 -; RV32I-NEXT: beqz t5, .LBB22_6 +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: xor t6, t0, t1 +; RV32I-NEXT: xor s0, a6, a7 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: beqz t6, .LBB22_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t6, t4 +; RV32I-NEXT: mv t3, t4 ; RV32I-NEXT: .LBB22_6: -; RV32I-NEXT: sltu t4, a3, a4 -; RV32I-NEXT: mv t5, t4 -; RV32I-NEXT: beq a1, a5, .LBB22_8 +; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: beq a1, a4, .LBB22_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: sltu t5, a5, a1 +; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: beqz t6, .LBB22_10 +; RV32I-NEXT: sltu t5, a3, a5 +; RV32I-NEXT: mv t6, t5 +; RV32I-NEXT: beq a4, a1, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sub a2, t0, a2 +; RV32I-NEXT: sltu t6, a4, a1 +; RV32I-NEXT: .LBB22_10: +; RV32I-NEXT: bnez t3, .LBB22_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sub t0, t1, t0 ; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sltu a7, 
a6, t3 -; RV32I-NEXT: sub a1, a1, a5 +; RV32I-NEXT: sub a2, t0, a2 +; RV32I-NEXT: sltu a7, a6, t4 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t3 +; RV32I-NEXT: sub a3, a5, a3 +; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a1, a1, t2 -; RV32I-NEXT: sub a3, a4, a3 -; RV32I-NEXT: j .LBB22_11 -; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: sltu t1, a6, a7 -; RV32I-NEXT: sub a2, a2, t0 -; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sub a4, a6, t4 +; RV32I-NEXT: j .LBB22_13 +; RV32I-NEXT: .LBB22_12: +; RV32I-NEXT: sltu a2, a6, a7 +; RV32I-NEXT: sub t0, t0, t1 +; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t5 +; RV32I-NEXT: sltu a7, a6, t6 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t5 -; RV32I-NEXT: sub a5, a5, a1 -; RV32I-NEXT: sub a1, a5, t4 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: .LBB22_11: -; RV32I-NEXT: sw a6, 8(a0) +; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: sub a4, a4, a1 +; RV32I-NEXT: sub a1, a4, t5 +; RV32I-NEXT: sub a4, a6, t6 +; RV32I-NEXT: .LBB22_13: +; RV32I-NEXT: sw a4, 8(a0) ; RV32I-NEXT: sw a1, 4(a0) ; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: ; RV64I: # %bb.0: -; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: sltu a4, a2, a0 ; RV64I-NEXT: mv a5, a4 ; RV64I-NEXT: beq a1, a3, .LBB22_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: slt a5, a1, a3 +; RV64I-NEXT: slt a5, a3, a1 ; RV64I-NEXT: .LBB22_2: -; RV64I-NEXT: beqz a5, .LBB22_4 +; RV64I-NEXT: bnez a5, .LBB22_4 ; RV64I-NEXT: # %bb.3: -; RV64I-NEXT: sub a1, a1, a3 +; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: sub a0, a2, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB22_4: -; RV64I-NEXT: sltu a4, a2, a0 -; RV64I-NEXT: sub a1, a3, a1 +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: sub a1, a1, a3 ; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a2, a0 +; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a4, 0(a1) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) -; RV32ZBB-NEXT: lw a7, 8(a1) -; RV32ZBB-NEXT: lw a2, 12(a2) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a4, 4(a1) +; RV32ZBB-NEXT: lw a6, 8(a1) +; RV32ZBB-NEXT: lw a7, 8(a2) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a7, a6 -; RV32ZBB-NEXT: mv t4, t1 -; RV32ZBB-NEXT: beq t0, a2, .LBB22_2 +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: sltu a2, a7, a6 +; RV32ZBB-NEXT: mv t4, a2 +; RV32ZBB-NEXT: beq t0, t1, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t0, a2 +; RV32ZBB-NEXT: slt t4, t1, t0 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: xor t3, t0, a2 -; RV32ZBB-NEXT: xor t5, a7, a6 -; RV32ZBB-NEXT: sltu t2, a4, a3 -; RV32ZBB-NEXT: or t5, t5, t3 +; RV32ZBB-NEXT: sltu t2, a5, a3 +; RV32ZBB-NEXT: sltu t5, a1, a4 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_4 +; RV32ZBB-NEXT: beq a4, a1, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t3, a1, a5 +; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: mv t6, t3 -; RV32ZBB-NEXT: beqz t5, .LBB22_6 +; RV32ZBB-NEXT: addi sp, sp, -16 +; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBB-NEXT: xor t6, t0, t1 +; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: or t6, s0, t6 +; RV32ZBB-NEXT: beqz t6, .LBB22_6 ; RV32ZBB-NEXT: # 
%bb.5: -; RV32ZBB-NEXT: mv t6, t4 +; RV32ZBB-NEXT: mv t3, t4 ; RV32ZBB-NEXT: .LBB22_6: -; RV32ZBB-NEXT: sltu t4, a3, a4 -; RV32ZBB-NEXT: mv t5, t4 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_8 +; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: beq a1, a4, .LBB22_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: sltu t5, a5, a1 +; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: beqz t6, .LBB22_10 +; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: mv t6, t5 +; RV32ZBB-NEXT: beq a4, a1, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: .LBB22_10: +; RV32ZBB-NEXT: bnez t3, .LBB22_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sub t0, t1, t0 ; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sltu a7, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, a5 +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sltu a7, a6, t4 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t3 +; RV32ZBB-NEXT: sub a3, a5, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: sub a1, a1, t2 -; RV32ZBB-NEXT: sub a3, a4, a3 -; RV32ZBB-NEXT: j .LBB22_11 -; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: sltu t1, a6, a7 -; RV32ZBB-NEXT: sub a2, a2, t0 -; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: j .LBB22_13 +; RV32ZBB-NEXT: .LBB22_12: +; RV32ZBB-NEXT: sltu a2, a6, a7 +; RV32ZBB-NEXT: sub t0, t0, t1 +; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t5 +; RV32ZBB-NEXT: sltu a7, a6, t6 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t5 -; RV32ZBB-NEXT: sub a5, a5, a1 -; RV32ZBB-NEXT: sub a1, a5, t4 -; RV32ZBB-NEXT: sub a3, a3, a4 -; RV32ZBB-NEXT: .LBB22_11: -; RV32ZBB-NEXT: sw a6, 8(a0) +; RV32ZBB-NEXT: sub a3, a3, a5 +; RV32ZBB-NEXT: sub a4, a4, a1 +; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: .LBB22_13: +; RV32ZBB-NEXT: sw a4, 8(a0) ; RV32ZBB-NEXT: sw a1, 4(a0) ; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sltu a4, a2, a0 ; RV64ZBB-NEXT: mv a5, a4 ; RV64ZBB-NEXT: beq a1, a3, .LBB22_2 ; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: slt a5, a1, a3 +; RV64ZBB-NEXT: slt a5, a3, a1 ; RV64ZBB-NEXT: .LBB22_2: -; RV64ZBB-NEXT: beqz a5, .LBB22_4 +; RV64ZBB-NEXT: bnez a5, .LBB22_4 ; RV64ZBB-NEXT: # %bb.3: -; RV64ZBB-NEXT: sub a1, a1, a3 +; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: sub a0, a2, a0 ; RV64ZBB-NEXT: ret ; RV64ZBB-NEXT: .LBB22_4: -; RV64ZBB-NEXT: sltu a4, a2, a0 -; RV64ZBB-NEXT: sub a1, a3, a1 +; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sub a1, a1, a3 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp sge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/RISCV/abdu-neg.ll b/llvm/test/CodeGen/RISCV/abdu-neg.ll index bcacdf44ab103..87a06fc4403eb 100644 --- a/llvm/test/CodeGen/RISCV/abdu-neg.ll +++ b/llvm/test/CodeGen/RISCV/abdu-neg.ll @@ -1597,17 +1597,6 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { -; CHECK-LABEL: abd_cmp_i8: -; CHECK: # %bb.0: -; CHECK-NEXT: andi 
a2, a0, 255 -; CHECK-NEXT: andi a3, a1, 255 -; CHECK-NEXT: bgeu a3, a2, .LBB18_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: sub a0, a1, a0 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB18_2: -; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: ret ; NOZBB-LABEL: abd_cmp_i8: ; NOZBB: # %bb.0: ; NOZBB-NEXT: andi a2, a0, 255 @@ -1740,28 +1729,27 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: abd_cmp_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: sltu a4, a0, a2 -; RV32I-NEXT: sub a3, a1, a3 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: sub a2, a0, a2 -; RV32I-NEXT: beq a3, a1, .LBB21_2 +; RV32I-NEXT: mv a5, a4 +; RV32I-NEXT: beq a1, a3, .LBB21_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a0, a1, a3 -; RV32I-NEXT: j .LBB21_3 +; RV32I-NEXT: sltu a5, a1, a3 ; RV32I-NEXT: .LBB21_2: -; RV32I-NEXT: sltu a0, a0, a2 -; RV32I-NEXT: .LBB21_3: -; RV32I-NEXT: neg a1, a0 -; RV32I-NEXT: xor a2, a2, a1 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: xor a1, a3, a1 -; RV32I-NEXT: add a1, a1, a0 +; RV32I-NEXT: bnez a5, .LBB21_4 +; RV32I-NEXT: # %bb.3: +; RV32I-NEXT: sltu a4, a2, a0 +; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB21_4: +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: bltu a1, a0, .LBB21_2 +; RV64I-NEXT: bltu a0, a1, .LBB21_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret @@ -1772,234 +1760,218 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32ZBB-LABEL: abd_cmp_i64: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: sltu a4, a0, a2 -; RV32ZBB-NEXT: sub a3, a1, a3 -; RV32ZBB-NEXT: sub a3, a3, a4 -; RV32ZBB-NEXT: sub a2, a0, a2 -; RV32ZBB-NEXT: beq a3, a1, .LBB21_2 +; RV32ZBB-NEXT: mv a5, a4 +; RV32ZBB-NEXT: beq a1, a3, .LBB21_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu a0, a1, a3 -; RV32ZBB-NEXT: j .LBB21_3 +; RV32ZBB-NEXT: sltu a5, a1, a3 ; RV32ZBB-NEXT: .LBB21_2: -; RV32ZBB-NEXT: sltu a0, a0, a2 -; RV32ZBB-NEXT: .LBB21_3: -; RV32ZBB-NEXT: neg a1, a0 -; RV32ZBB-NEXT: xor a2, a2, a1 -; RV32ZBB-NEXT: sltu a4, a2, a1 -; RV32ZBB-NEXT: xor a1, a3, a1 -; RV32ZBB-NEXT: add a1, a1, a0 +; RV32ZBB-NEXT: bnez a5, .LBB21_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: sltu a4, a2, a0 +; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: ret +; RV32ZBB-NEXT: .LBB21_4: +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a0, a0, a2 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: minu a2, a0, a1 -; RV64ZBB-NEXT: maxu a0, a0, a1 -; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: bltu a0, a1, .LBB21_2 +; RV64ZBB-NEXT: # %bb.1: +; RV64ZBB-NEXT: sub a0, a1, a0 +; RV64ZBB-NEXT: ret +; RV64ZBB-NEXT: .LBB21_2: +; RV64ZBB-NEXT: sub a0, a0, a1 ; RV64ZBB-NEXT: ret %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a7, 8(a2) -; RV32I-NEXT: lw a4, 8(a1) -; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw t0, 4(a2) +; RV32I-NEXT: lw a3, 0(a2) +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a5, 4(a2) +; RV32I-NEXT: lw 
a6, 8(a2) +; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw a2, 12(a2) +; RV32I-NEXT: lw t0, 12(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu a2, a4, a7 -; RV32I-NEXT: sub t1, a6, t1 -; RV32I-NEXT: sltu t2, a3, a5 -; RV32I-NEXT: sub a2, t1, a2 -; RV32I-NEXT: mv t1, t2 -; RV32I-NEXT: beq a1, t0, .LBB22_2 +; RV32I-NEXT: sltu t1, a7, a6 +; RV32I-NEXT: mv t4, t1 +; RV32I-NEXT: beq t0, a2, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, t0 +; RV32I-NEXT: sltu t4, t0, a2 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: sub a7, a4, a7 -; RV32I-NEXT: sltu t3, a7, t1 -; RV32I-NEXT: sub a2, a2, t3 -; RV32I-NEXT: sub a7, a7, t1 -; RV32I-NEXT: beq a2, a6, .LBB22_4 +; RV32I-NEXT: sltu t2, a4, a3 +; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: beq a1, a5, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a6, a2 -; RV32I-NEXT: j .LBB22_5 +; RV32I-NEXT: sltu t3, a1, a5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: sltu t1, a4, a7 -; RV32I-NEXT: .LBB22_5: -; RV32I-NEXT: sub t0, a1, t0 -; RV32I-NEXT: sub t0, t0, t2 -; RV32I-NEXT: sub a5, a3, a5 -; RV32I-NEXT: beq t0, a1, .LBB22_7 -; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a1, a1, t0 -; RV32I-NEXT: j .LBB22_8 -; RV32I-NEXT: .LBB22_7: -; RV32I-NEXT: sltu a1, a3, a5 +; RV32I-NEXT: xor t5, t0, a2 +; RV32I-NEXT: xor t6, a7, a6 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: mv t6, t3 +; RV32I-NEXT: beqz t5, .LBB22_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: mv t6, t4 +; RV32I-NEXT: .LBB22_6: +; RV32I-NEXT: sltu t4, a3, a4 +; RV32I-NEXT: mv t5, t4 +; RV32I-NEXT: beq a1, a5, .LBB22_8 +; RV32I-NEXT: # %bb.7: +; RV32I-NEXT: sltu t5, a5, a1 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: xor a3, a2, a6 -; RV32I-NEXT: xor a4, a7, a4 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: beqz a3, .LBB22_10 +; RV32I-NEXT: bnez t6, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: sltu t1, a6, a7 +; RV32I-NEXT: sub a2, a2, t0 +; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sub a6, a6, a7 +; RV32I-NEXT: sltu a7, a6, t5 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: sub a6, a6, t5 +; RV32I-NEXT: sub a5, a5, a1 +; RV32I-NEXT: sub a1, a5, t4 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: j .LBB22_11 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: neg a6, a1 -; RV32I-NEXT: xor a3, a7, a6 -; RV32I-NEXT: sltu a4, a3, a6 -; RV32I-NEXT: xor a2, a2, a6 -; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: sub a4, a2, a4 -; RV32I-NEXT: xor a2, a5, a6 -; RV32I-NEXT: sltu a5, a2, a6 -; RV32I-NEXT: xor a7, t0, a6 -; RV32I-NEXT: mv t1, a5 -; RV32I-NEXT: beqz t0, .LBB22_12 -; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu t1, a7, a6 -; RV32I-NEXT: .LBB22_12: -; RV32I-NEXT: add a3, a3, a1 -; RV32I-NEXT: sltu a6, a3, t1 -; RV32I-NEXT: sub a4, a4, a6 -; RV32I-NEXT: sub a3, a3, t1 -; RV32I-NEXT: add a7, a7, a1 -; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: sw a1, 0(a0) -; RV32I-NEXT: sw a5, 4(a0) -; RV32I-NEXT: sw a3, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sub a2, t0, a2 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sltu a7, a6, t3 +; RV32I-NEXT: sub a1, a1, a5 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: sub a6, a6, t3 +; RV32I-NEXT: sub a1, a1, t2 +; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: .LBB22_11: +; RV32I-NEXT: sw a6, 8(a0) +; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a3, 0(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: ; RV64I: # %bb.0: ; RV64I-NEXT: sltu a4, a0, a2 -; RV64I-NEXT: sub a3, a1, a3 -; RV64I-NEXT: sub a3, a3, a4 -; RV64I-NEXT: sub a2, a0, a2 -; RV64I-NEXT: beq a3, a1, .LBB22_2 +; RV64I-NEXT: mv a5, a4 +; 
RV64I-NEXT: beq a1, a3, .LBB22_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sltu a0, a1, a3 -; RV64I-NEXT: j .LBB22_3 +; RV64I-NEXT: sltu a5, a1, a3 ; RV64I-NEXT: .LBB22_2: -; RV64I-NEXT: sltu a0, a0, a2 -; RV64I-NEXT: .LBB22_3: -; RV64I-NEXT: neg a1, a0 -; RV64I-NEXT: xor a2, a2, a1 -; RV64I-NEXT: sltu a4, a2, a1 -; RV64I-NEXT: xor a1, a3, a1 -; RV64I-NEXT: add a1, a1, a0 +; RV64I-NEXT: bnez a5, .LBB22_4 +; RV64I-NEXT: # %bb.3: +; RV64I-NEXT: sltu a4, a2, a0 +; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: sub a0, a2, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB22_4: +; RV64I-NEXT: sub a1, a1, a3 +; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a3, 0(a1) -; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a7, 8(a2) -; RV32ZBB-NEXT: lw a4, 8(a1) -; RV32ZBB-NEXT: lw a6, 12(a1) -; RV32ZBB-NEXT: lw t0, 4(a2) +; RV32ZBB-NEXT: lw a3, 0(a2) +; RV32ZBB-NEXT: lw a4, 0(a1) +; RV32ZBB-NEXT: lw a5, 4(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a7, 8(a1) +; RV32ZBB-NEXT: lw a2, 12(a2) +; RV32ZBB-NEXT: lw t0, 12(a1) ; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu a2, a4, a7 -; RV32ZBB-NEXT: sub t1, a6, t1 -; RV32ZBB-NEXT: sltu t2, a3, a5 -; RV32ZBB-NEXT: sub a2, t1, a2 -; RV32ZBB-NEXT: mv t1, t2 -; RV32ZBB-NEXT: beq a1, t0, .LBB22_2 +; RV32ZBB-NEXT: sltu t1, a7, a6 +; RV32ZBB-NEXT: mv t4, t1 +; RV32ZBB-NEXT: beq t0, a2, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, t0 +; RV32ZBB-NEXT: sltu t4, t0, a2 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: sub a7, a4, a7 -; RV32ZBB-NEXT: sltu t3, a7, t1 -; RV32ZBB-NEXT: sub a2, a2, t3 -; RV32ZBB-NEXT: sub a7, a7, t1 -; RV32ZBB-NEXT: beq a2, a6, .LBB22_4 +; RV32ZBB-NEXT: sltu t2, a4, a3 +; RV32ZBB-NEXT: mv t3, t2 +; RV32ZBB-NEXT: beq a1, a5, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a6, a2 -; RV32ZBB-NEXT: j .LBB22_5 +; RV32ZBB-NEXT: sltu t3, a1, a5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: sltu t1, a4, a7 -; RV32ZBB-NEXT: .LBB22_5: -; RV32ZBB-NEXT: sub t0, a1, t0 -; RV32ZBB-NEXT: sub t0, t0, t2 -; RV32ZBB-NEXT: sub a5, a3, a5 -; RV32ZBB-NEXT: beq t0, a1, .LBB22_7 -; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a1, a1, t0 -; RV32ZBB-NEXT: j .LBB22_8 -; RV32ZBB-NEXT: .LBB22_7: -; RV32ZBB-NEXT: sltu a1, a3, a5 +; RV32ZBB-NEXT: xor t5, t0, a2 +; RV32ZBB-NEXT: xor t6, a7, a6 +; RV32ZBB-NEXT: or t5, t6, t5 +; RV32ZBB-NEXT: mv t6, t3 +; RV32ZBB-NEXT: beqz t5, .LBB22_6 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: mv t6, t4 +; RV32ZBB-NEXT: .LBB22_6: +; RV32ZBB-NEXT: sltu t4, a3, a4 +; RV32ZBB-NEXT: mv t5, t4 +; RV32ZBB-NEXT: beq a1, a5, .LBB22_8 +; RV32ZBB-NEXT: # %bb.7: +; RV32ZBB-NEXT: sltu t5, a5, a1 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: xor a3, a2, a6 -; RV32ZBB-NEXT: xor a4, a7, a4 -; RV32ZBB-NEXT: or a3, a4, a3 -; RV32ZBB-NEXT: beqz a3, .LBB22_10 +; RV32ZBB-NEXT: bnez t6, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a1, t1 +; RV32ZBB-NEXT: sltu t1, a6, a7 +; RV32ZBB-NEXT: sub a2, a2, t0 +; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sub a6, a6, a7 +; RV32ZBB-NEXT: sltu a7, a6, t5 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a6, a6, t5 +; RV32ZBB-NEXT: sub a5, a5, a1 +; RV32ZBB-NEXT: sub a1, a5, t4 +; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: j .LBB22_11 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: neg a6, a1 -; RV32ZBB-NEXT: xor a3, a7, a6 -; RV32ZBB-NEXT: sltu a4, a3, a6 -; RV32ZBB-NEXT: xor a2, a2, a6 -; 
RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: sub a4, a2, a4 -; RV32ZBB-NEXT: xor a2, a5, a6 -; RV32ZBB-NEXT: sltu a5, a2, a6 -; RV32ZBB-NEXT: xor a7, t0, a6 -; RV32ZBB-NEXT: mv t1, a5 -; RV32ZBB-NEXT: beqz t0, .LBB22_12 -; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu t1, a7, a6 -; RV32ZBB-NEXT: .LBB22_12: -; RV32ZBB-NEXT: add a3, a3, a1 -; RV32ZBB-NEXT: sltu a6, a3, t1 -; RV32ZBB-NEXT: sub a4, a4, a6 -; RV32ZBB-NEXT: sub a3, a3, t1 -; RV32ZBB-NEXT: add a7, a7, a1 -; RV32ZBB-NEXT: sub a5, a7, a5 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: sw a1, 0(a0) -; RV32ZBB-NEXT: sw a5, 4(a0) -; RV32ZBB-NEXT: sw a3, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sub a6, a7, a6 +; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sltu a7, a6, t3 +; RV32ZBB-NEXT: sub a1, a1, a5 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a6, a6, t3 +; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: .LBB22_11: +; RV32ZBB-NEXT: sw a6, 8(a0) +; RV32ZBB-NEXT: sw a1, 4(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sltu a4, a0, a2 -; RV64ZBB-NEXT: sub a3, a1, a3 -; RV64ZBB-NEXT: sub a3, a3, a4 -; RV64ZBB-NEXT: sub a2, a0, a2 -; RV64ZBB-NEXT: beq a3, a1, .LBB22_2 +; RV64ZBB-NEXT: mv a5, a4 +; RV64ZBB-NEXT: beq a1, a3, .LBB22_2 ; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sltu a0, a1, a3 -; RV64ZBB-NEXT: j .LBB22_3 +; RV64ZBB-NEXT: sltu a5, a1, a3 ; RV64ZBB-NEXT: .LBB22_2: -; RV64ZBB-NEXT: sltu a0, a0, a2 -; RV64ZBB-NEXT: .LBB22_3: -; RV64ZBB-NEXT: neg a1, a0 -; RV64ZBB-NEXT: xor a2, a2, a1 -; RV64ZBB-NEXT: sltu a4, a2, a1 -; RV64ZBB-NEXT: xor a1, a3, a1 -; RV64ZBB-NEXT: add a1, a1, a0 +; RV64ZBB-NEXT: bnez a5, .LBB22_4 +; RV64ZBB-NEXT: # %bb.3: +; RV64ZBB-NEXT: sltu a4, a2, a0 +; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: add a0, a2, a0 +; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: ret +; RV64ZBB-NEXT: .LBB22_4: +; RV64ZBB-NEXT: sub a1, a1, a3 +; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp ult i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll index 39aef369a2967..14f45895754df 100644 --- a/llvm/test/CodeGen/RISCV/abdu.ll +++ b/llvm/test/CodeGen/RISCV/abdu.ll @@ -1457,249 +1457,266 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: abd_cmp_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: sltu a4, a0, a2 -; RV32I-NEXT: mv a5, a4 -; RV32I-NEXT: beq a1, a3, .LBB21_2 +; RV32I-NEXT: sub a3, a1, a3 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sub a2, a0, a2 +; RV32I-NEXT: beq a3, a1, .LBB21_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a5, a1, a3 +; RV32I-NEXT: sltu a0, a1, a3 +; RV32I-NEXT: j .LBB21_3 ; RV32I-NEXT: .LBB21_2: -; RV32I-NEXT: beqz a5, .LBB21_4 -; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a2 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB21_4: -; RV32I-NEXT: sltu a4, a2, a0 -; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sltu a0, a0, a2 +; RV32I-NEXT: .LBB21_3: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: xor a2, a2, a1 +; RV32I-NEXT: sltu a4, a2, a1 +; RV32I-NEXT: xor a1, a3, a1 +; RV32I-NEXT: add a1, a1, a0 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: ret 
; ; RV64I-LABEL: abd_cmp_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: bgeu a0, a1, .LBB21_2 +; RV64I-NEXT: bltu a1, a0, .LBB21_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB21_2: -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i64: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: sltu a4, a0, a2 -; RV32ZBB-NEXT: mv a5, a4 -; RV32ZBB-NEXT: beq a1, a3, .LBB21_2 +; RV32ZBB-NEXT: sub a3, a1, a3 +; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sub a2, a0, a2 +; RV32ZBB-NEXT: beq a3, a1, .LBB21_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu a5, a1, a3 +; RV32ZBB-NEXT: sltu a0, a1, a3 +; RV32ZBB-NEXT: j .LBB21_3 ; RV32ZBB-NEXT: .LBB21_2: -; RV32ZBB-NEXT: beqz a5, .LBB21_4 -; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sub a1, a1, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a0, a2 -; RV32ZBB-NEXT: ret -; RV32ZBB-NEXT: .LBB21_4: -; RV32ZBB-NEXT: sltu a4, a2, a0 -; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sltu a0, a0, a2 +; RV32ZBB-NEXT: .LBB21_3: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: xor a2, a2, a1 +; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: xor a1, a3, a1 +; RV32ZBB-NEXT: add a1, a1, a0 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: add a0, a2, a0 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: bgeu a0, a1, .LBB21_2 -; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sub a0, a0, a1 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB21_2: -; RV64ZBB-NEXT: sub a0, a1, a0 +; RV64ZBB-NEXT: minu a2, a0, a1 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp uge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a4, 0(a1) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) -; RV32I-NEXT: lw a7, 8(a1) -; RV32I-NEXT: lw a2, 12(a2) -; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw t1, 12(a2) +; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a6, 12(a1) +; RV32I-NEXT: lw t0, 4(a2) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a7, a6 -; RV32I-NEXT: mv t4, t1 -; RV32I-NEXT: beq t0, a2, .LBB22_2 +; RV32I-NEXT: sltu a2, a4, a7 +; RV32I-NEXT: sub t1, a6, t1 +; RV32I-NEXT: sltu t2, a3, a5 +; RV32I-NEXT: sub a2, t1, a2 +; RV32I-NEXT: mv t1, t2 +; RV32I-NEXT: beq a1, t0, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t4, t0, a2 +; RV32I-NEXT: sltu t1, a1, t0 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: xor t3, t0, a2 -; RV32I-NEXT: xor t5, a7, a6 -; RV32I-NEXT: sltu t2, a4, a3 -; RV32I-NEXT: or t5, t5, t3 -; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a1, a5, .LBB22_4 +; RV32I-NEXT: sub a7, a4, a7 +; RV32I-NEXT: sltu t3, a7, t1 +; RV32I-NEXT: sub a2, a2, t3 +; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: beq a2, a6, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t3, a1, a5 +; RV32I-NEXT: sltu t1, a6, a2 +; RV32I-NEXT: j .LBB22_5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: mv t6, t3 -; RV32I-NEXT: beqz t5, .LBB22_6 -; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t6, t4 -; RV32I-NEXT: .LBB22_6: -; RV32I-NEXT: sltu t4, a3, a4 -; RV32I-NEXT: mv t5, t4 -; RV32I-NEXT: beq a1, a5, .LBB22_8 -; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: sltu t5, a5, a1 +; RV32I-NEXT: sltu t1, 
a4, a7 +; RV32I-NEXT: .LBB22_5: +; RV32I-NEXT: sub t0, a1, t0 +; RV32I-NEXT: sub t0, t0, t2 +; RV32I-NEXT: sub a5, a3, a5 +; RV32I-NEXT: beq t0, a1, .LBB22_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: sltu a1, a1, t0 +; RV32I-NEXT: j .LBB22_8 +; RV32I-NEXT: .LBB22_7: +; RV32I-NEXT: sltu a1, a3, a5 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: beqz t6, .LBB22_10 +; RV32I-NEXT: xor a3, a2, a6 +; RV32I-NEXT: xor a4, a7, a4 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: beqz a3, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sltu a7, a6, t3 -; RV32I-NEXT: sub a1, a1, a5 -; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t3 -; RV32I-NEXT: sub a1, a1, t2 -; RV32I-NEXT: sub a3, a4, a3 -; RV32I-NEXT: j .LBB22_11 +; RV32I-NEXT: mv a1, t1 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: sltu t1, a6, a7 -; RV32I-NEXT: sub a2, a2, t0 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t5 -; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t5 -; RV32I-NEXT: sub a5, a5, a1 -; RV32I-NEXT: sub a1, a5, t4 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: .LBB22_11: -; RV32I-NEXT: sw a6, 8(a0) -; RV32I-NEXT: sw a1, 4(a0) -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: neg a6, a1 +; RV32I-NEXT: xor a3, a7, a6 +; RV32I-NEXT: sltu a4, a3, a6 +; RV32I-NEXT: xor a2, a2, a6 +; RV32I-NEXT: add a2, a2, a1 +; RV32I-NEXT: sub a4, a2, a4 +; RV32I-NEXT: xor a2, a5, a6 +; RV32I-NEXT: sltu a5, a2, a6 +; RV32I-NEXT: xor a7, t0, a6 +; RV32I-NEXT: mv t1, a5 +; RV32I-NEXT: beqz t0, .LBB22_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sltu t1, a7, a6 +; RV32I-NEXT: .LBB22_12: +; RV32I-NEXT: add a3, a3, a1 +; RV32I-NEXT: sltu a6, a3, t1 +; RV32I-NEXT: sub a4, a4, a6 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: add a7, a7, a1 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw a3, 8(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: ; RV64I: # %bb.0: ; RV64I-NEXT: sltu a4, a0, a2 -; RV64I-NEXT: mv a5, a4 -; RV64I-NEXT: beq a1, a3, .LBB22_2 +; RV64I-NEXT: sub a3, a1, a3 +; RV64I-NEXT: sub a3, a3, a4 +; RV64I-NEXT: sub a2, a0, a2 +; RV64I-NEXT: beq a3, a1, .LBB22_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sltu a5, a1, a3 +; RV64I-NEXT: sltu a0, a1, a3 +; RV64I-NEXT: j .LBB22_3 ; RV64I-NEXT: .LBB22_2: -; RV64I-NEXT: beqz a5, .LBB22_4 -; RV64I-NEXT: # %bb.3: -; RV64I-NEXT: sub a1, a1, a3 -; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a0, a2 -; RV64I-NEXT: ret -; RV64I-NEXT: .LBB22_4: -; RV64I-NEXT: sltu a4, a2, a0 -; RV64I-NEXT: sub a1, a3, a1 +; RV64I-NEXT: sltu a0, a0, a2 +; RV64I-NEXT: .LBB22_3: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: sltu a4, a2, a1 +; RV64I-NEXT: xor a1, a3, a1 +; RV64I-NEXT: add a1, a1, a0 ; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a2, a0 +; RV64I-NEXT: add a0, a2, a0 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a4, 0(a1) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) -; RV32ZBB-NEXT: lw a7, 8(a1) -; RV32ZBB-NEXT: lw a2, 12(a2) -; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a6, 12(a1) +; RV32ZBB-NEXT: lw t0, 4(a2) ; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a7, a6 -; 
RV32ZBB-NEXT: mv t4, t1 -; RV32ZBB-NEXT: beq t0, a2, .LBB22_2 +; RV32ZBB-NEXT: sltu a2, a4, a7 +; RV32ZBB-NEXT: sub t1, a6, t1 +; RV32ZBB-NEXT: sltu t2, a3, a5 +; RV32ZBB-NEXT: sub a2, t1, a2 +; RV32ZBB-NEXT: mv t1, t2 +; RV32ZBB-NEXT: beq a1, t0, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t4, t0, a2 +; RV32ZBB-NEXT: sltu t1, a1, t0 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: xor t3, t0, a2 -; RV32ZBB-NEXT: xor t5, a7, a6 -; RV32ZBB-NEXT: sltu t2, a4, a3 -; RV32ZBB-NEXT: or t5, t5, t3 -; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_4 +; RV32ZBB-NEXT: sub a7, a4, a7 +; RV32ZBB-NEXT: sltu t3, a7, t1 +; RV32ZBB-NEXT: sub a2, a2, t3 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: beq a2, a6, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t3, a1, a5 +; RV32ZBB-NEXT: sltu t1, a6, a2 +; RV32ZBB-NEXT: j .LBB22_5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: mv t6, t3 -; RV32ZBB-NEXT: beqz t5, .LBB22_6 -; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t6, t4 -; RV32ZBB-NEXT: .LBB22_6: -; RV32ZBB-NEXT: sltu t4, a3, a4 -; RV32ZBB-NEXT: mv t5, t4 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_8 -; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: sltu t5, a5, a1 +; RV32ZBB-NEXT: sltu t1, a4, a7 +; RV32ZBB-NEXT: .LBB22_5: +; RV32ZBB-NEXT: sub t0, a1, t0 +; RV32ZBB-NEXT: sub t0, t0, t2 +; RV32ZBB-NEXT: sub a5, a3, a5 +; RV32ZBB-NEXT: beq t0, a1, .LBB22_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: sltu a1, a1, t0 +; RV32ZBB-NEXT: j .LBB22_8 +; RV32ZBB-NEXT: .LBB22_7: +; RV32ZBB-NEXT: sltu a1, a3, a5 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: beqz t6, .LBB22_10 +; RV32ZBB-NEXT: xor a3, a2, a6 +; RV32ZBB-NEXT: xor a4, a7, a4 +; RV32ZBB-NEXT: or a3, a4, a3 +; RV32ZBB-NEXT: beqz a3, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sltu a7, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, a5 -; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, t2 -; RV32ZBB-NEXT: sub a3, a4, a3 -; RV32ZBB-NEXT: j .LBB22_11 +; RV32ZBB-NEXT: mv a1, t1 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: sltu t1, a6, a7 -; RV32ZBB-NEXT: sub a2, a2, t0 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t5 -; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t5 -; RV32ZBB-NEXT: sub a5, a5, a1 -; RV32ZBB-NEXT: sub a1, a5, t4 -; RV32ZBB-NEXT: sub a3, a3, a4 -; RV32ZBB-NEXT: .LBB22_11: -; RV32ZBB-NEXT: sw a6, 8(a0) -; RV32ZBB-NEXT: sw a1, 4(a0) -; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: neg a6, a1 +; RV32ZBB-NEXT: xor a3, a7, a6 +; RV32ZBB-NEXT: sltu a4, a3, a6 +; RV32ZBB-NEXT: xor a2, a2, a6 +; RV32ZBB-NEXT: add a2, a2, a1 +; RV32ZBB-NEXT: sub a4, a2, a4 +; RV32ZBB-NEXT: xor a2, a5, a6 +; RV32ZBB-NEXT: sltu a5, a2, a6 +; RV32ZBB-NEXT: xor a7, t0, a6 +; RV32ZBB-NEXT: mv t1, a5 +; RV32ZBB-NEXT: beqz t0, .LBB22_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a7, a6 +; RV32ZBB-NEXT: .LBB22_12: +; RV32ZBB-NEXT: add a3, a3, a1 +; RV32ZBB-NEXT: sltu a6, a3, t1 +; RV32ZBB-NEXT: sub a4, a4, a6 +; RV32ZBB-NEXT: sub a3, a3, t1 +; RV32ZBB-NEXT: add a7, a7, a1 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a5, 4(a0) +; RV32ZBB-NEXT: sw a3, 8(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sltu a4, a0, a2 -; RV64ZBB-NEXT: mv a5, a4 -; RV64ZBB-NEXT: beq a1, a3, .LBB22_2 +; RV64ZBB-NEXT: sub 
a3, a1, a3 +; RV64ZBB-NEXT: sub a3, a3, a4 +; RV64ZBB-NEXT: sub a2, a0, a2 +; RV64ZBB-NEXT: beq a3, a1, .LBB22_2 ; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sltu a5, a1, a3 +; RV64ZBB-NEXT: sltu a0, a1, a3 +; RV64ZBB-NEXT: j .LBB22_3 ; RV64ZBB-NEXT: .LBB22_2: -; RV64ZBB-NEXT: beqz a5, .LBB22_4 -; RV64ZBB-NEXT: # %bb.3: -; RV64ZBB-NEXT: sub a1, a1, a3 -; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a0, a2 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB22_4: -; RV64ZBB-NEXT: sltu a4, a2, a0 -; RV64ZBB-NEXT: sub a1, a3, a1 +; RV64ZBB-NEXT: sltu a0, a0, a2 +; RV64ZBB-NEXT: .LBB22_3: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: xor a2, a2, a1 +; RV64ZBB-NEXT: sltu a4, a2, a1 +; RV64ZBB-NEXT: xor a1, a3, a1 +; RV64ZBB-NEXT: add a1, a1, a0 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: add a0, a2, a0 ; RV64ZBB-NEXT: ret %cmp = icmp uge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index 246cd8e0e852d..833273dc98243 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -842,8 +842,8 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: cmovll %edi, %eax -; X86-NEXT: cmovll %ebx, %edx +; X86-NEXT: cmovgel %edi, %eax +; X86-NEXT: cmovgel %ebx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -853,13 +853,14 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax -; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax +; X64-NEXT: negq %rax +; X64-NEXT: subq %rsi, %rdi +; X64-NEXT: cmovlq %rdi, %rax ; X64-NEXT: retq %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } @@ -888,10 +889,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: cmovgel (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovgel %ebx, %esi +; X86-NEXT: cmovgel %ebp, %ecx +; X86-NEXT: cmovgel %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) @@ -906,20 +907,20 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; ; X64-LABEL: abd_cmp_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: subq %rdx, %rax -; X64-NEXT: movq %rsi, %r8 -; X64-NEXT: sbbq %rcx, %r8 -; X64-NEXT: subq %rdi, %rdx -; X64-NEXT: sbbq %rsi, %rcx -; X64-NEXT: cmovgeq %rdx, %rax -; X64-NEXT: cmovgeq %rcx, %r8 +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: sbbq %rsi, %r8 +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: sbbq %rcx, %rsi +; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: cmovlq %rsi, %r8 ; X64-NEXT: movq %r8, %rdx ; X64-NEXT: retq %cmp = icmp slt i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index 
9f3b99b349aed..d9ba140032b31 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -744,8 +744,8 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: cmovgel %edi, %eax -; X86-NEXT: cmovgel %ebx, %edx +; X86-NEXT: cmovll %edi, %eax +; X86-NEXT: cmovll %ebx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -755,14 +755,13 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax -; X64-NEXT: negq %rax -; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: retq %cmp = icmp sge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } @@ -791,10 +790,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovgel (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovgel %ebx, %esi -; X86-NEXT: cmovgel %ebp, %ecx -; X86-NEXT: cmovgel %eax, %edi +; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %ebx, %esi +; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) @@ -809,20 +808,20 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; ; X64-LABEL: abd_cmp_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: subq %rdi, %rax -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: sbbq %rsi, %r8 -; X64-NEXT: subq %rdx, %rdi -; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: cmovlq %rdi, %rax -; X64-NEXT: cmovlq %rsi, %r8 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: subq %rdx, %rax +; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: sbbq %rcx, %r8 +; X64-NEXT: subq %rdi, %rdx +; X64-NEXT: sbbq %rsi, %rcx +; X64-NEXT: cmovgeq %rdx, %rax +; X64-NEXT: cmovgeq %rcx, %r8 ; X64-NEXT: movq %r8, %rdx ; X64-NEXT: retq %cmp = icmp sge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll index 9cb3e5e8bf0c2..507f7681400ef 100644 --- a/llvm/test/CodeGen/X86/abdu-neg.ll +++ b/llvm/test/CodeGen/X86/abdu-neg.ll @@ -824,8 +824,8 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: cmovbl %edi, %eax -; X86-NEXT: cmovbl %ebx, %edx +; X86-NEXT: cmovael %edi, %eax +; X86-NEXT: cmovael %ebx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -837,12 +837,12 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: negq %rax ; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: cmovaeq %rdi, %rax +; X64-NEXT: cmovbq %rdi, %rax ; X64-NEXT: retq %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } @@ -871,10 +871,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovbl (%esp), %edx # 4-byte Folded Reload -; 
X86-NEXT: cmovbl %ebx, %esi -; X86-NEXT: cmovbl %ebp, %ecx -; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: cmovael (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovael %ebx, %esi +; X86-NEXT: cmovael %ebp, %ecx +; X86-NEXT: cmovael %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) @@ -895,14 +895,14 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X64-NEXT: sbbq %rsi, %r8 ; X64-NEXT: subq %rdx, %rdi ; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: cmovaeq %rdi, %rax -; X64-NEXT: cmovaeq %rsi, %r8 +; X64-NEXT: cmovbq %rdi, %rax +; X64-NEXT: cmovbq %rsi, %r8 ; X64-NEXT: movq %r8, %rdx ; X64-NEXT: retq %cmp = icmp ult i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index c8fa19cb661b6..290894d2712e8 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -695,98 +695,83 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-LABEL: abd_cmp_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: cmovael %edi, %eax -; X86-NEXT: cmovael %ebx, %edx -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax -; X64-NEXT: negq %rax -; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: cmovbq %rdi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovaeq %rsi, %rax ; X64-NEXT: retq %cmp = icmp uge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovael (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovael 
%ebx, %esi -; X86-NEXT: cmovael %ebp, %ecx -; X86-NEXT: cmovael %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: subl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_cmp_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: subq %rdi, %rax -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: sbbq %rsi, %r8 -; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: xorl %edi, %edi +; X64-NEXT: subq %rdx, %rax ; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: cmovbq %rdi, %rax -; X64-NEXT: cmovbq %rsi, %r8 -; X64-NEXT: movq %r8, %rdx +; X64-NEXT: sbbq %rdi, %rdi +; X64-NEXT: xorq %rdi, %rsi +; X64-NEXT: xorq %rdi, %rax +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: sbbq %rdi, %rsi +; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: retq %cmp = icmp uge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } From b25b9a7d6c872e42121aa024f362fae0b15dd72c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 4 Sep 2024 11:58:15 +0100 Subject: [PATCH 076/425] [DAG] visitSELECT - add "select usubo(x, y).overflow, (sub y, x), (usubo x, y) -> abdu(x, y)" fold (and neg equivalent) Handle cases where CGP has merged the CMP+SUB into a USUBO node - improves a few outstanding niggles from #100810 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 21 ++++ llvm/test/CodeGen/X86/abdu-neg.ll | 111 +++++++++--------- llvm/test/CodeGen/X86/abdu.ll | 32 ++--- 3 files changed, 89 insertions(+), 75 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b0a906743f29f..6390231341f96 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11719,6 +11719,24 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { N2_2, Flags); } } + + // select usubo(x, y).overflow, (sub y, x), (usubo x, y) -> abdu(x, y) + if (N0.getOpcode() == ISD::USUBO && N0.getResNo() == 1 && + N2.getNode() == N0.getNode() && N2.getResNo() == 0 && + N1.getOpcode() == ISD::SUB && N2.getOperand(0) == N1.getOperand(1) && + N2.getOperand(1) == N1.getOperand(0) && + (!LegalOperations || TLI.isOperationLegal(ISD::ABDU, VT))) + return DAG.getNode(ISD::ABDU, DL, VT, N0.getOperand(0), N0.getOperand(1)); + + // select usubo(x, y).overflow, (usubo x, y), (sub y, x) -> neg (abdu x, y) + if (N0.getOpcode() == ISD::USUBO && N0.getResNo() == 1 && + N1.getNode() == N0.getNode() && N1.getResNo() == 0 && + N2.getOpcode() == ISD::SUB && N2.getOperand(0) == N1.getOperand(1) && + N2.getOperand(1) == N1.getOperand(0) && + (!LegalOperations || TLI.isOperationLegal(ISD::ABDU, VT))) + return DAG.getNegative( + DAG.getNode(ISD::ABDU, DL, VT, N0.getOperand(0), N0.getOperand(1)), + DL, VT); } // Fold selects based on a setcc into other things, such as min/max/abs. 
@@ -11776,6 +11794,9 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { return SelectNode; } + if (SDValue ABD = foldSelectToABD(Cond0, Cond1, N1, N2, CC, DL)) + return ABD; + if (SDValue NewSel = SimplifySelect(DL, N0, N1, N2)) return NewSel; } diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll index 507f7681400ef..24962be43b5cf 100644 --- a/llvm/test/CodeGen/X86/abdu-neg.ll +++ b/llvm/test/CodeGen/X86/abdu-neg.ll @@ -751,27 +751,23 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; X86-LABEL: abd_cmp_i16: ; X86: # %bb.0: -; X86-NEXT: pushl %esi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: subw %dx, %si -; X86-NEXT: movl %esi, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negl %eax -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: cmovbl %esi, %eax +; X86-NEXT: cmovnsl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i16: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: subw %si, %cx +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: subl %eax, %ecx ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: negl %eax -; X64-NEXT: cmpw %si, %di -; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovnsl %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %cmp = icmp ult i16 %a, %b @@ -811,33 +807,30 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-LABEL: abd_cmp_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: cmovael %edi, %eax -; X86-NEXT: cmovael %ebx, %edx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: negl %eax +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovaeq %rsi, %rax ; X64-NEXT: negq %rax -; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: cmovbq %rdi, %rax ; X64-NEXT: retq %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b @@ -853,34 +846,36 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovael (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovael %ebx, %esi -; X86-NEXT: cmovael %ebp, %ecx -; X86-NEXT: cmovael %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebp, %ebp +; X86-NEXT: xorl %ebp, %ecx +; X86-NEXT: xorl %ebp, %esi +; X86-NEXT: xorl %ebp, %ebx +; X86-NEXT: xorl %ebp, %edx +; X86-NEXT: subl %ebp, %edx +; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: negl %edx +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebx, %ebp +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: sbbl %ecx, %edi ; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: movl %ebp, 4(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -889,15 +884,19 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; ; X64-LABEL: abd_cmp_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: subq %rdi, %rax -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: sbbq %rsi, %r8 -; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: xorl %edi, %edi +; X64-NEXT: subq %rdx, %rax ; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: cmovbq %rdi, %rax -; X64-NEXT: cmovbq %rsi, %r8 -; X64-NEXT: movq %r8, %rdx +; X64-NEXT: movl $0, %ecx +; X64-NEXT: sbbq %rcx, %rcx +; X64-NEXT: xorq %rcx, %rsi +; X64-NEXT: xorq %rcx, %rax +; X64-NEXT: subq %rcx, %rax +; X64-NEXT: sbbq %rcx, %rsi +; X64-NEXT: negq %rax +; X64-NEXT: sbbq %rsi, %rdi +; X64-NEXT: movq %rdi, %rdx ; X64-NEXT: retq %cmp = icmp ult i128 %a, %b %ab = sub i128 %a, %b diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index 290894d2712e8..3bee81b61b98a 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -608,25 +608,21 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subb %cl, %dl -; X86-NEXT: negb %dl -; X86-NEXT: subb %cl, %al -; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: movzbl %dl, %eax -; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovsl %ecx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i8: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: subb %dil, %al -; X64-NEXT: negb %al -; X64-NEXT: subb %dil, %sil +; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: movzbl %sil, %ecx -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: subl %eax, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovsl %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %cmp = icmp ugt i8 %a, %b @@ -670,9 +666,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, 
%edx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: cmovbl %edx, %eax ; X86-NEXT: retl @@ -681,9 +676,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax -; X64-NEXT: negl %eax -; X64-NEXT: subl %esi, %edi -; X64-NEXT: cmovael %edi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovael %esi, %eax ; X64-NEXT: retq %cmp = icmp ult i32 %a, %b %ab = sub i32 %a, %b From 8d4235d97e5bd12b8244f9ffc157651a9a288b36 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 13:02:02 +0200 Subject: [PATCH 077/425] [Lint] Fix another scalable vector crash We also need to check that the memory access LocationSize is not scalable. --- llvm/lib/Analysis/Lint.cpp | 3 ++- llvm/test/Analysis/Lint/scalable.ll | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 415b16d25efd2..00e430ce8e0ab 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -454,7 +454,8 @@ void Lint::visitMemoryReference(Instruction &I, const MemoryLocation &Loc, // Accesses from before the start or after the end of the object are not // defined. - Check(!Loc.Size.hasValue() || BaseSize == MemoryLocation::UnknownSize || + Check(!Loc.Size.hasValue() || Loc.Size.isScalable() || + BaseSize == MemoryLocation::UnknownSize || (Offset >= 0 && Offset + Loc.Size.getValue() <= BaseSize), "Undefined behavior: Buffer overflow", &I); diff --git a/llvm/test/Analysis/Lint/scalable.ll b/llvm/test/Analysis/Lint/scalable.ll index 4bcc4dae8d837..bc12d6738d2aa 100644 --- a/llvm/test/Analysis/Lint/scalable.ll +++ b/llvm/test/Analysis/Lint/scalable.ll @@ -7,6 +7,13 @@ define @alloca_access() { ret %v } +; CHECK-NOT: Buffer overflow +define @alloca_access2() { + %a = alloca <256 x i8> + %v = load , ptr %a + ret %v +} + ; CHECK-NOT: insertelement index out of range define @insertelement() { %insert = insertelement poison, half 0xH0000, i64 100 From 55a24738302eb9bb5bf458220deb20ddef60ce51 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 13:04:36 +0200 Subject: [PATCH 078/425] [CtxProf] Replace include with forward declaration (NFC) This header is fairly expensive. Forward declare PGOContextualProfile instead. --- llvm/include/llvm/Transforms/Utils/Cloning.h | 4 ++-- llvm/lib/Transforms/Utils/InlineFunction.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h index 2ddcfeb1501e2..a4be24e32c527 100644 --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -20,7 +20,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CtxProfAnalysis.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/ValueHandle.h" @@ -42,6 +41,7 @@ class Instruction; class Loop; class LoopInfo; class Module; +class PGOContextualProfile; class ProfileSummaryInfo; class ReturnInst; class DomTreeUpdater; @@ -276,7 +276,7 @@ InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, /// to the behavior of the non-contextual profile updating variant above. This /// makes it easy to drop-in replace uses of the non-contextual overload. 
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, - CtxProfAnalysis::Result &CtxProf, + PGOContextualProfile &CtxProf, bool MergeAttributes = false, AAResults *CalleeAAR = nullptr, bool InsertLifetime = true, diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 799ef3ab021d3..2e05fa80464b8 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2142,7 +2142,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // it's not worth updating those. static const std::pair, std::vector> remapIndices(Function &Caller, BasicBlock *StartBB, - CtxProfAnalysis::Result &CtxProf, uint32_t CalleeCounters, + PGOContextualProfile &CtxProf, uint32_t CalleeCounters, uint32_t CalleeCallsites) { // We'll allocate a new ID to imported callsite counters and callsites. We're // using -1 to indicate a counter we delete. Most likely the entry ID, for @@ -2258,7 +2258,7 @@ remapIndices(Function &Caller, BasicBlock *StartBB, // copying over the data of the callee, **intentionally without any value // scaling**, and copying over the callees of the inlined callee. llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, - CtxProfAnalysis::Result &CtxProf, + PGOContextualProfile &CtxProf, bool MergeAttributes, AAResults *CalleeAAR, bool InsertLifetime, From 43b8ae3cea7c0f45dc29479ba8024e0adae9d145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Wed, 4 Sep 2024 13:11:45 +0200 Subject: [PATCH 079/425] [AMDGPU][LDS] Pre-Commit tests for 'Fix dynamic LDS interaction with "amdgpu-no-lds-kernel-id" (#107091) --- .../AMDGPU/lower-module-lds-zero-size-arr.ll | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll index da1d23f1496cf..c7829be565373 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s ; This is an extension and should be rejected by the front-end in most cases. @@ -6,9 +6,13 @@ @Var0 = linkonce_odr hidden local_unnamed_addr addrspace(3) global [0 x float] poison -define void @fn(float %val, i32 %idx) { +;. +; CHECK: @llvm.amdgcn.kernelA.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernelA.dynlds to i32)] +;. 
+define void @fn(float %val, i32 %idx) #0 { ; CHECK-LABEL: define void @fn( -; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) { +; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[VAR0]], align 4 @@ -22,9 +26,9 @@ define void @fn(float %val, i32 %idx) { ret void } -define amdgpu_kernel void @kernelA(float %val, i32 %idx) { +define amdgpu_kernel void @kernelA(float %val, i32 %idx) #0 { ; CHECK-LABEL: define amdgpu_kernel void @kernelA( -; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { +; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernelA.dynlds) ] ; CHECK-NEXT: tail call void @fn(float [[VAL]], i32 [[IDX]]) ; CHECK-NEXT: ret void @@ -32,6 +36,14 @@ define amdgpu_kernel void @kernelA(float %val, i32 %idx) { tail call void @fn(float %val, i32 %idx) ret void } + +attributes #0 = { "amdgpu-no-lds-kernel-id" } + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-lds-kernel-id" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. +; CHECK: [[META0]] = !{i32 0, i32 1} ; CHECK: [[META1]] = !{i32 0} ;. From 9ba41031de105d7babf3ae53facd368f2b4e409f Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Wed, 4 Sep 2024 12:35:44 +0100 Subject: [PATCH 080/425] [OpenMP]Update use_device_clause lowering (#101703) This patch updates the use_device_ptr and use_device_addr clauses to use the mapInfoOps for lowering. This allows all the types that are handle by the map clauses such as derived types to also be supported by the use_device_clauses. This is patch 1/2 in a series of patches. 
Co-authored-by: Raghu Maddhipatla raghu.maddhipatla@amd.com --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 145 ++++++++----- flang/lib/Lower/OpenMP/ClauseProcessor.h | 50 ++--- flang/lib/Lower/OpenMP/OpenMP.cpp | 195 +++++++++--------- .../Optimizer/OpenMP/MapInfoFinalization.cpp | 37 +++- flang/test/Lower/OpenMP/target.f90 | 12 +- .../use-device-ptr-to-use-device-addr.f90 | 96 +++++---- 6 files changed, 283 insertions(+), 252 deletions(-) diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index dd6068ba048cc..6dee31ddb6963 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -887,13 +887,64 @@ bool ClauseProcessor::processLink( }); } +void ClauseProcessor::processMapObjects( + lower::StatementContext &stmtCtx, mlir::Location clauseLocation, + const omp::ObjectList &objects, + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits, + std::map> &parentMemberIndices, + llvm::SmallVectorImpl &mapVars, + llvm::SmallVectorImpl *mapSyms, + llvm::SmallVectorImpl *mapSymLocs, + llvm::SmallVectorImpl *mapSymTypes) const { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + for (const omp::Object &object : objects) { + llvm::SmallVector bounds; + std::stringstream asFortran; + + lower::AddrAndBoundsInfo info = + lower::gatherDataOperandAddrAndBounds( + converter, firOpBuilder, semaCtx, stmtCtx, *object.sym(), + object.ref(), clauseLocation, asFortran, bounds, + treatIndexAsSection); + + // Explicit map captures are captured ByRef by default, + // optimisation passes may alter this to ByCopy or other capture + // types to optimise + mlir::Value baseOp = info.rawInput; + auto location = mlir::NameLoc::get( + mlir::StringAttr::get(firOpBuilder.getContext(), asFortran.str()), + baseOp.getLoc()); + mlir::omp::MapInfoOp mapOp = createMapInfoOp( + firOpBuilder, location, baseOp, + /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds, + /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, + static_cast< + std::underlying_type_t>( + mapTypeBits), + mlir::omp::VariableCaptureKind::ByRef, baseOp.getType()); + + if (object.sym()->owner().IsDerivedType()) { + addChildIndexAndMapToParent(object, parentMemberIndices, mapOp, semaCtx); + } else { + mapVars.push_back(mapOp); + if (mapSyms) + mapSyms->push_back(object.sym()); + if (mapSymTypes) + mapSymTypes->push_back(baseOp.getType()); + if (mapSymLocs) + mapSymLocs->push_back(baseOp.getLoc()); + } + } +} + bool ClauseProcessor::processMap( mlir::Location currentLocation, lower::StatementContext &stmtCtx, mlir::omp::MapClauseOps &result, llvm::SmallVectorImpl *mapSyms, llvm::SmallVectorImpl *mapSymLocs, llvm::SmallVectorImpl *mapSymTypes) const { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); // We always require tracking of symbols, even if the caller does not, // so we create an optionally used local set of symbols when the mapSyms // argument is not present. 
@@ -948,46 +999,10 @@ bool ClauseProcessor::processMap( mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; } - - for (const omp::Object &object : std::get(clause.t)) { - llvm::SmallVector bounds; - std::stringstream asFortran; - - lower::AddrAndBoundsInfo info = - lower::gatherDataOperandAddrAndBounds( - converter, firOpBuilder, semaCtx, stmtCtx, *object.sym(), - object.ref(), clauseLocation, asFortran, bounds, - treatIndexAsSection); - - // Explicit map captures are captured ByRef by default, - // optimisation passes may alter this to ByCopy or other capture - // types to optimise - mlir::Value baseOp = info.rawInput; - auto location = mlir::NameLoc::get( - mlir::StringAttr::get(firOpBuilder.getContext(), asFortran.str()), - baseOp.getLoc()); - mlir::omp::MapInfoOp mapOp = createMapInfoOp( - firOpBuilder, location, baseOp, - /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds, - /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, - static_cast< - std::underlying_type_t>( - mapTypeBits), - mlir::omp::VariableCaptureKind::ByRef, baseOp.getType()); - - if (object.sym()->owner().IsDerivedType()) { - addChildIndexAndMapToParent(object, parentMemberIndices, mapOp, - semaCtx); - } else { - result.mapVars.push_back(mapOp); - ptrMapSyms->push_back(object.sym()); - if (mapSymTypes) - mapSymTypes->push_back(baseOp.getType()); - if (mapSymLocs) - mapSymLocs->push_back(baseOp.getLoc()); - } - } + processMapObjects(stmtCtx, clauseLocation, + std::get(clause.t), mapTypeBits, + parentMemberIndices, result.mapVars, ptrMapSyms, + mapSymLocs, mapSymTypes); }); insertChildMapInfoIntoParent(converter, parentMemberIndices, result.mapVars, @@ -1050,27 +1065,55 @@ bool ClauseProcessor::processEnter( } bool ClauseProcessor::processUseDeviceAddr( - mlir::omp::UseDeviceAddrClauseOps &result, + lower::StatementContext &stmtCtx, mlir::omp::UseDeviceAddrClauseOps &result, llvm::SmallVectorImpl &useDeviceTypes, llvm::SmallVectorImpl &useDeviceLocs, llvm::SmallVectorImpl &useDeviceSyms) const { - return findRepeatableClause( - [&](const omp::clause::UseDeviceAddr &clause, const parser::CharBlock &) { - addUseDeviceClause(converter, clause.v, result.useDeviceAddrVars, - useDeviceTypes, useDeviceLocs, useDeviceSyms); + std::map> + parentMemberIndices; + bool clauseFound = findRepeatableClause( + [&](const omp::clause::UseDeviceAddr &clause, + const parser::CharBlock &source) { + mlir::Location location = converter.genLocation(source); + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + processMapObjects(stmtCtx, location, clause.v, mapTypeBits, + parentMemberIndices, result.useDeviceAddrVars, + &useDeviceSyms, &useDeviceLocs, &useDeviceTypes); }); + + insertChildMapInfoIntoParent(converter, parentMemberIndices, + result.useDeviceAddrVars, useDeviceSyms, + &useDeviceTypes, &useDeviceLocs); + return clauseFound; } bool ClauseProcessor::processUseDevicePtr( - mlir::omp::UseDevicePtrClauseOps &result, + lower::StatementContext &stmtCtx, mlir::omp::UseDevicePtrClauseOps &result, llvm::SmallVectorImpl &useDeviceTypes, llvm::SmallVectorImpl &useDeviceLocs, llvm::SmallVectorImpl &useDeviceSyms) const { - return findRepeatableClause( - [&](const omp::clause::UseDevicePtr &clause, const parser::CharBlock &) { - addUseDeviceClause(converter, clause.v, result.useDevicePtrVars, - useDeviceTypes, useDeviceLocs, useDeviceSyms); + std::map> + 
parentMemberIndices; + bool clauseFound = findRepeatableClause( + [&](const omp::clause::UseDevicePtr &clause, + const parser::CharBlock &source) { + mlir::Location location = converter.genLocation(source); + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + processMapObjects(stmtCtx, location, clause.v, mapTypeBits, + parentMemberIndices, result.useDevicePtrVars, + &useDeviceSyms, &useDeviceLocs, &useDeviceTypes); }); + + insertChildMapInfoIntoParent(converter, parentMemberIndices, + result.useDevicePtrVars, useDeviceSyms, + &useDeviceTypes, &useDeviceLocs); + return clauseFound; } } // namespace omp diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 4a90f667c7248..f6b319c726a2d 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -128,11 +128,13 @@ class ClauseProcessor { nullptr) const; bool processTo(llvm::SmallVectorImpl &result) const; bool processUseDeviceAddr( + lower::StatementContext &stmtCtx, mlir::omp::UseDeviceAddrClauseOps &result, llvm::SmallVectorImpl &useDeviceTypes, llvm::SmallVectorImpl &useDeviceLocs, llvm::SmallVectorImpl &useDeviceSyms) const; bool processUseDevicePtr( + lower::StatementContext &stmtCtx, mlir::omp::UseDevicePtrClauseOps &result, llvm::SmallVectorImpl &useDeviceTypes, llvm::SmallVectorImpl &useDeviceLocs, @@ -172,6 +174,17 @@ class ClauseProcessor { template bool markClauseOccurrence(mlir::UnitAttr &result) const; + void processMapObjects( + lower::StatementContext &stmtCtx, mlir::Location clauseLocation, + const omp::ObjectList &objects, + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits, + std::map> &parentMemberIndices, + llvm::SmallVectorImpl &mapVars, + llvm::SmallVectorImpl *mapSyms, + llvm::SmallVectorImpl *mapSymLocs = nullptr, + llvm::SmallVectorImpl *mapSymTypes = nullptr) const; + lower::AbstractConverter &converter; semantics::SemanticsContext &semaCtx; List clauses; @@ -188,7 +201,6 @@ bool ClauseProcessor::processMotionClauses(lower::StatementContext &stmtCtx, bool clauseFound = findRepeatableClause( [&](const T &clause, const parser::CharBlock &source) { mlir::Location clauseLocation = converter.genLocation(source); - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); static_assert(std::is_same_v || std::is_same_v); @@ -199,39 +211,9 @@ bool ClauseProcessor::processMotionClauses(lower::StatementContext &stmtCtx, ? 
llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - auto &objects = std::get(clause.t); - for (const omp::Object &object : objects) { - llvm::SmallVector bounds; - std::stringstream asFortran; - - lower::AddrAndBoundsInfo info = - lower::gatherDataOperandAddrAndBounds( - converter, firOpBuilder, semaCtx, stmtCtx, *object.sym(), - object.ref(), clauseLocation, asFortran, bounds, - treatIndexAsSection); - - // Explicit map captures are captured ByRef by default, - // optimisation passes may alter this to ByCopy or other capture - // types to optimise - mlir::Value baseOp = info.rawInput; - mlir::omp::MapInfoOp mapOp = createMapInfoOp( - firOpBuilder, clauseLocation, baseOp, - /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds, - /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, - static_cast< - std::underlying_type_t>( - mapTypeBits), - mlir::omp::VariableCaptureKind::ByRef, baseOp.getType()); - - if (object.sym()->owner().IsDerivedType()) { - addChildIndexAndMapToParent(object, parentMemberIndices, mapOp, - semaCtx); - } else { - result.mapVars.push_back(mapOp); - mapSymbols.push_back(object.sym()); - } - } + processMapObjects(stmtCtx, clauseLocation, + std::get(clause.t), mapTypeBits, + parentMemberIndices, result.mapVars, &mapSymbols); }); insertChildMapInfoIntoParent(converter, parentMemberIndices, result.mapVars, diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 2fd5d4b33074e..8b77f1ac6b4ff 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -702,45 +702,94 @@ static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info, marker->erase(); } +void mapBodySymbols(lower::AbstractConverter &converter, mlir::Region ®ion, + llvm::ArrayRef mapSyms) { + assert(region.hasOneBlock() && "target must have single region"); + mlir::Block ®ionBlock = region.front(); + // Clones the `bounds` placing them inside the target region and returns them. + auto cloneBound = [&](mlir::Value bound) { + if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { + mlir::Operation *clonedOp = bound.getDefiningOp()->clone(); + regionBlock.push_back(clonedOp); + return clonedOp->getResult(0); + } + TODO(converter.getCurrentLocation(), + "target map clause operand unsupported bound type"); + }; + + auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { + llvm::SmallVector clonedBounds; + for (mlir::Value bound : bounds) + clonedBounds.emplace_back(cloneBound(bound)); + return clonedBounds; + }; + + // Bind the symbols to their corresponding block arguments. + for (auto [argIndex, argSymbol] : llvm::enumerate(mapSyms)) { + const mlir::BlockArgument &arg = region.getArgument(argIndex); + // Avoid capture of a reference to a structured binding. + const semantics::Symbol *sym = argSymbol; + // Structure component symbols don't have bindings. 
+ if (sym->owner().IsDerivedType()) + continue; + fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); + auto refType = mlir::dyn_cast(arg.getType()); + if (refType && fir::isa_builtin_cptr_type(refType.getElementType())) { + converter.bindSymbol(*argSymbol, arg); + } else { + extVal.match( + [&](const fir::BoxValue &v) { + converter.bindSymbol(*sym, + fir::BoxValue(arg, cloneBounds(v.getLBounds()), + v.getExplicitParameters(), + v.getExplicitExtents())); + }, + [&](const fir::MutableBoxValue &v) { + converter.bindSymbol( + *sym, fir::MutableBoxValue(arg, cloneBounds(v.getLBounds()), + v.getMutableProperties())); + }, + [&](const fir::ArrayBoxValue &v) { + converter.bindSymbol( + *sym, fir::ArrayBoxValue(arg, cloneBounds(v.getExtents()), + cloneBounds(v.getLBounds()), + v.getSourceBox())); + }, + [&](const fir::CharArrayBoxValue &v) { + converter.bindSymbol( + *sym, fir::CharArrayBoxValue(arg, cloneBound(v.getLen()), + cloneBounds(v.getExtents()), + cloneBounds(v.getLBounds()))); + }, + [&](const fir::CharBoxValue &v) { + converter.bindSymbol( + *sym, fir::CharBoxValue(arg, cloneBound(v.getLen()))); + }, + [&](const fir::UnboxedValue &v) { converter.bindSymbol(*sym, arg); }, + [&](const auto &) { + TODO(converter.getCurrentLocation(), + "target map clause operand unsupported type"); + }); + } + } +} + static void genBodyOfTargetDataOp( lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - mlir::omp::TargetDataOp &dataOp, llvm::ArrayRef useDeviceTypes, - llvm::ArrayRef useDeviceLocs, + mlir::omp::TargetDataOp &dataOp, llvm::ArrayRef useDeviceSymbols, + llvm::ArrayRef useDeviceLocs, + llvm::ArrayRef useDeviceTypes, const mlir::Location ¤tLocation, const ConstructQueue &queue, ConstructQueue::const_iterator item) { + assert(useDeviceTypes.size() == useDeviceLocs.size()); + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); mlir::Region ®ion = dataOp.getRegion(); - firOpBuilder.createBlock(®ion, {}, useDeviceTypes, useDeviceLocs); - for (auto [argIndex, argSymbol] : llvm::enumerate(useDeviceSymbols)) { - const mlir::BlockArgument &arg = region.front().getArgument(argIndex); - fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*argSymbol); - if (auto refType = mlir::dyn_cast(arg.getType())) { - if (fir::isa_builtin_cptr_type(refType.getElementType())) { - converter.bindSymbol(*argSymbol, arg); - } else { - // Avoid capture of a reference to a structured binding. - const semantics::Symbol *sym = argSymbol; - extVal.match( - [&](const fir::MutableBoxValue &mbv) { - converter.bindSymbol( - *sym, - fir::MutableBoxValue( - arg, fir::factory::getNonDeferredLenParams(extVal), {})); - }, - [&](const auto &) { - TODO(converter.getCurrentLocation(), - "use_device clause operand unsupported type"); - }); - } - } else { - TODO(converter.getCurrentLocation(), - "use_device clause operand unsupported type"); - } - } + mapBodySymbols(converter, region, useDeviceSymbols); // Insert dummy instruction to remember the insertion position. The // marker will be deleted by clean up passes since there are no uses. @@ -748,7 +797,7 @@ static void genBodyOfTargetDataOp( // there are hlfir.declares inserted above while setting block arguments // and new code from the body should be inserted after that. mlir::Value undefMarker = firOpBuilder.create( - dataOp.getOperation()->getLoc(), firOpBuilder.getIndexType()); + dataOp.getLoc(), firOpBuilder.getIndexType()); // Create blocks for unstructured regions. 
This has to be done since // blocks are initially allocated with the function as the parent region. @@ -806,93 +855,33 @@ static void genBodyOfTargetOp( mlir::omp::TargetOp &targetOp, llvm::ArrayRef mapSyms, llvm::ArrayRef mapSymLocs, - llvm::ArrayRef mapSymTypes, DataSharingProcessor &dsp, + llvm::ArrayRef mapSymTypes, const mlir::Location ¤tLocation, const ConstructQueue &queue, - ConstructQueue::const_iterator item) { + ConstructQueue::const_iterator item, DataSharingProcessor &dsp) { assert(mapSymTypes.size() == mapSymLocs.size()); fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); mlir::Region ®ion = targetOp.getRegion(); llvm::SmallVector allRegionArgTypes; + llvm::SmallVector allRegionArgLocs; mergePrivateVarsInfo(targetOp, mapSymTypes, llvm::function_ref{ [](mlir::Value v) { return v.getType(); }}, allRegionArgTypes); - llvm::SmallVector allRegionArgLocs; mergePrivateVarsInfo(targetOp, mapSymLocs, llvm::function_ref{ [](mlir::Value v) { return v.getLoc(); }}, allRegionArgLocs); - auto *regionBlock = firOpBuilder.createBlock(®ion, {}, allRegionArgTypes, - allRegionArgLocs); + mlir::Block *regionBlock = firOpBuilder.createBlock( + ®ion, {}, allRegionArgTypes, allRegionArgLocs); - // Clones the `bounds` placing them inside the target region and returns them. - auto cloneBound = [&](mlir::Value bound) { - if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { - mlir::Operation *clonedOp = bound.getDefiningOp()->clone(); - regionBlock->push_back(clonedOp); - return clonedOp->getResult(0); - } - TODO(converter.getCurrentLocation(), - "target map clause operand unsupported bound type"); - }; - - auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { - llvm::SmallVector clonedBounds; - for (mlir::Value bound : bounds) - clonedBounds.emplace_back(cloneBound(bound)); - return clonedBounds; - }; - - // Bind the symbols to their corresponding block arguments. - for (auto [argIndex, argSymbol] : llvm::enumerate(mapSyms)) { - const mlir::BlockArgument &arg = region.getArgument(argIndex); - // Avoid capture of a reference to a structured binding. - const semantics::Symbol *sym = argSymbol; - // Structure component symbols don't have bindings. 
- if (sym->owner().IsDerivedType()) - continue; - fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); - extVal.match( - [&](const fir::BoxValue &v) { - converter.bindSymbol(*sym, - fir::BoxValue(arg, cloneBounds(v.getLBounds()), - v.getExplicitParameters(), - v.getExplicitExtents())); - }, - [&](const fir::MutableBoxValue &v) { - converter.bindSymbol( - *sym, fir::MutableBoxValue(arg, cloneBounds(v.getLBounds()), - v.getMutableProperties())); - }, - [&](const fir::ArrayBoxValue &v) { - converter.bindSymbol( - *sym, fir::ArrayBoxValue(arg, cloneBounds(v.getExtents()), - cloneBounds(v.getLBounds()), - v.getSourceBox())); - }, - [&](const fir::CharArrayBoxValue &v) { - converter.bindSymbol( - *sym, fir::CharArrayBoxValue(arg, cloneBound(v.getLen()), - cloneBounds(v.getExtents()), - cloneBounds(v.getLBounds()))); - }, - [&](const fir::CharBoxValue &v) { - converter.bindSymbol(*sym, - fir::CharBoxValue(arg, cloneBound(v.getLen()))); - }, - [&](const fir::UnboxedValue &v) { converter.bindSymbol(*sym, arg); }, - [&](const auto &) { - TODO(converter.getCurrentLocation(), - "target map clause operand unsupported type"); - }); - } + mapBodySymbols(converter, region, mapSyms); for (auto [argIndex, argSymbol] : - llvm::enumerate(dsp.getDelayedPrivSymbols())) { + llvm::enumerate(dsp.getAllSymbolsToPrivatize())) { argIndex = mapSyms.size() + argIndex; const mlir::BlockArgument &arg = region.getArgument(argIndex); @@ -940,7 +929,9 @@ static void genBodyOfTargetOp( std::underlying_type_t>( llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType()); + targetOp.getMapVarsMutable().append(mapOp); + mlir::Value clonedValArg = region.addArgument(copyVal.getType(), copyVal.getLoc()); firOpBuilder.setInsertionPointToStart(regionBlock); @@ -962,7 +953,7 @@ static void genBodyOfTargetOp( // In the HLFIR flow there are hlfir.declares inserted above while // setting block arguments. mlir::Value undefMarker = firOpBuilder.create( - targetOp.getOperation()->getLoc(), firOpBuilder.getIndexType()); + targetOp.getLoc(), firOpBuilder.getIndexType()); // Create blocks for unstructured regions. This has to be done since // blocks are initially allocated with the function as the parent region. 
@@ -1201,9 +1192,9 @@ static void genTargetDataClauses( cp.processDevice(stmtCtx, clauseOps); cp.processIf(llvm::omp::Directive::OMPD_target_data, clauseOps); cp.processMap(loc, stmtCtx, clauseOps); - cp.processUseDeviceAddr(clauseOps, useDeviceTypes, useDeviceLocs, + cp.processUseDeviceAddr(stmtCtx, clauseOps, useDeviceTypes, useDeviceLocs, useDeviceSyms); - cp.processUseDevicePtr(clauseOps, useDeviceTypes, useDeviceLocs, + cp.processUseDevicePtr(stmtCtx, clauseOps, useDeviceTypes, useDeviceLocs, useDeviceSyms); // This function implements the deprecated functionality of use_device_ptr @@ -1807,7 +1798,7 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, auto targetOp = firOpBuilder.create(loc, clauseOps); genBodyOfTargetOp(converter, symTable, semaCtx, eval, targetOp, mapSyms, - mapLocs, mapTypes, dsp, loc, queue, item); + mapLocs, mapTypes, loc, queue, item, dsp); return targetOp; } @@ -1829,7 +1820,7 @@ genTargetDataOp(lower::AbstractConverter &converter, lower::SymMap &symTable, converter.getFirOpBuilder().create(loc, clauseOps); genBodyOfTargetDataOp(converter, symTable, semaCtx, eval, targetDataOp, - useDeviceTypes, useDeviceLocs, useDeviceSyms, loc, + useDeviceSyms, useDeviceLocs, useDeviceTypes, loc, queue, item); return targetDataOp; } diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index 04a11a52dbd04..7ebeb51cf3dec 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -125,13 +125,12 @@ class MapInfoFinalizationPass // TODO: map the addendum segment of the descriptor, similarly to the // above base address/data pointer member. - if (auto mapClauseOwner = - llvm::dyn_cast(target)) { + auto addOperands = [&](mlir::OperandRange &operandsArr, + mlir::MutableOperandRange &mutableOpRange, + auto directiveOp) { llvm::SmallVector newMapOps; - mlir::OperandRange mapVarsArr = mapClauseOwner.getMapVars(); - - for (size_t i = 0; i < mapVarsArr.size(); ++i) { - if (mapVarsArr[i] == op) { + for (size_t i = 0; i < operandsArr.size(); ++i) { + if (operandsArr[i] == op) { // Push new implicit maps generated for the descriptor. newMapOps.push_back(baseAddr); @@ -139,13 +138,29 @@ class MapInfoFinalizationPass // new additional map operand with an appropriate BlockArgument, // as the printing and later processing currently requires a 1:1 // mapping of BlockArgs to MapInfoOp's at the same placement in - // each array (BlockArgs and MapVars). - if (auto targetOp = llvm::dyn_cast(target)) - targetOp.getRegion().insertArgument(i, baseAddr.getType(), loc); + // each array (BlockArgs and MapOperands). 
+ if (directiveOp) { + directiveOp.getRegion().insertArgument(i, baseAddr.getType(), loc); + } } - newMapOps.push_back(mapVarsArr[i]); + newMapOps.push_back(operandsArr[i]); } - mapClauseOwner.getMapVarsMutable().assign(newMapOps); + mutableOpRange.assign(newMapOps); + }; + if (auto mapClauseOwner = + llvm::dyn_cast(target)) { + mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapVars(); + mlir::MutableOperandRange mapMutableOpRange = + mapClauseOwner.getMapVarsMutable(); + mlir::omp::TargetOp targetOp = + llvm::dyn_cast(target); + addOperands(mapOperandsArr, mapMutableOpRange, targetOp); + } + if (auto targetDataOp = llvm::dyn_cast(target)) { + mlir::OperandRange useDevAddrArr = targetDataOp.getUseDeviceAddrVars(); + mlir::MutableOperandRange useDevAddrMutableOpRange = + targetDataOp.getUseDeviceAddrVarsMutable(); + addOperands(useDevAddrArr, useDevAddrMutableOpRange, targetDataOp); } mlir::Value newDescParentMapOp = builder.create( diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 1d5ab6942dfa3..e86a2f9b6098d 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -526,21 +526,23 @@ end subroutine omp_target_device_ptr !=============================================================================== !CHECK-LABEL: func.func @_QPomp_target_device_addr() { - subroutine omp_target_device_addr +subroutine omp_target_device_addr integer, pointer :: a !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box> {bindc_name = "a", uniq_name = "_QFomp_target_device_addrEa"} !CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !CHECK: %[[MAP_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, i32) var_ptr_ptr({{.*}} : !fir.llvm_ptr>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr> {name = ""} !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, !fir.box>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBERS]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> {name = "a"} - !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[VAL_0_DECL]]#1 : !fir.ref>>) { + !CHECK: %[[DEV_ADDR_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, i32) var_ptr_ptr({{.*}} : !fir.llvm_ptr>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr> {name = ""} + !CHECK: %[[DEV_ADDR:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, !fir.box>) map_clauses(tofrom) capture(ByRef) members(%[[DEV_ADDR_MEMBERS]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> {name = "a"} + !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[DEV_ADDR_MEMBERS]], %[[DEV_ADDR]] : {{.*}}) { !$omp target data map(tofrom: a) use_device_addr(a) - !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref>>): - !CHECK: %[[VAL_1_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + !CHECK: ^bb0(%[[ARG_0:.*]]: !fir.llvm_ptr>, %[[ARG_1:.*]]: !fir.ref>>): + !CHECK: %[[VAL_1_DECL:.*]]:2 = hlfir.declare %[[ARG_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !CHECK: %[[C10:.*]] = arith.constant 10 : i32 !CHECK: %[[A_BOX:.*]] = fir.load %[[VAL_1_DECL]]#0 : !fir.ref>> !CHECK: %[[A_ADDR:.*]] = fir.box_addr %[[A_BOX]] : (!fir.box>) -> !fir.ptr !CHECK: hlfir.assign %[[C10]] to %[[A_ADDR]] : i32, !fir.ptr - a = 10 + a = 10 !CHECK: omp.terminator !$omp 
end target data !CHECK: } diff --git a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 index acb5f533b619e..085f5419fa7f8 100644 --- a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 +++ b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 @@ -2,72 +2,70 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s ! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s -! This tests primary goal is to check the promotion of -! non-CPTR arguments from use_device_ptr to -! use_device_addr works, without breaking any -! functionality +! This tests primary goal is to check the promotion of non-CPTR arguments from +! use_device_ptr to use_device_addr works, without breaking any functionality. !CHECK: func.func @{{.*}}only_use_device_ptr() -!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) use_device_ptr(%{{.*}} : !fir.ref>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>): -subroutine only_use_device_ptr +!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.llvm_ptr>>, !fir.ref>>>) use_device_ptr(%{{.*}} : !fir.ref>) { +!CHECK: ^bb0(%{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>): +subroutine only_use_device_ptr use iso_c_binding integer, pointer, dimension(:) :: array real, pointer :: pa(:) type(c_ptr) :: cptr - !$omp target data use_device_ptr(pa, cptr, array) - !$omp end target data -end subroutine + !$omp target data use_device_ptr(pa, cptr, array) + !$omp end target data + end subroutine !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr() -!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) use_device_ptr({{.*}} : !fir.ref>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): -subroutine mix_use_device_ptr_and_addr +!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.llvm_ptr>>, !fir.ref>>>) use_device_ptr({{.*}} : !fir.ref>) { +!CHECK: ^bb0(%{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): +subroutine mix_use_device_ptr_and_addr use iso_c_binding integer, pointer, dimension(:) :: array real, pointer :: pa(:) type(c_ptr) :: cptr - !$omp target data use_device_ptr(pa, cptr) use_device_addr(array) - !$omp end target data -end subroutine + !$omp target data use_device_ptr(pa, cptr) use_device_addr(array) + !$omp end target data + end subroutine -!CHECK: func.func @{{.*}}only_use_device_addr() -!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>, !fir.ref>>>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): -subroutine only_use_device_addr - use iso_c_binding - integer, pointer, dimension(:) :: array - real, pointer :: pa(:) - type(c_ptr) :: cptr + !CHECK: func.func @{{.*}}only_use_device_addr() + !CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.ref>, !fir.llvm_ptr>>, !fir.ref>>>) { + !CHECK: ^bb0(%{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>>>): + subroutine only_use_device_addr + use iso_c_binding + integer, pointer, dimension(:) :: array + real, pointer :: pa(:) + 
type(c_ptr) :: cptr - !$omp target data use_device_addr(pa, cptr, array) - !$omp end target data -end subroutine + !$omp target data use_device_addr(pa, cptr, array) + !$omp end target data + end subroutine -!CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr_and_map() -!CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}} : !fir.ref, !fir.ref) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) use_device_ptr(%{{.*}} : !fir.ref>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): -subroutine mix_use_device_ptr_and_addr_and_map - use iso_c_binding - integer :: i, j - integer, pointer, dimension(:) :: array - real, pointer :: pa(:) - type(c_ptr) :: cptr + !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr_and_map() + !CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}} : !fir.ref, !fir.ref) use_device_addr(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.llvm_ptr>>, !fir.ref>>>) use_device_ptr(%{{.*}} : !fir.ref>) { + !CHECK: ^bb0(%{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): + subroutine mix_use_device_ptr_and_addr_and_map + use iso_c_binding + integer :: i, j + integer, pointer, dimension(:) :: array + real, pointer :: pa(:) + type(c_ptr) :: cptr - !$omp target data use_device_ptr(pa, cptr) use_device_addr(array) map(tofrom: i, j) - !$omp end target data -end subroutine + !$omp target data use_device_ptr(pa, cptr) use_device_addr(array) map(tofrom: i, j) + !$omp end target data + end subroutine -!CHECK: func.func @{{.*}}only_use_map() -!CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.ref>, !fir.llvm_ptr>>, !fir.ref>>>) { -subroutine only_use_map - use iso_c_binding - integer, pointer, dimension(:) :: array - real, pointer :: pa(:) - type(c_ptr) :: cptr + !CHECK: func.func @{{.*}}only_use_map() + !CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.ref>, !fir.llvm_ptr>>, !fir.ref>>>) { + subroutine only_use_map + use iso_c_binding + integer, pointer, dimension(:) :: array + real, pointer :: pa(:) + type(c_ptr) :: cptr - !$omp target data map(pa, cptr, array) - !$omp end target data -end subroutine + !$omp target data map(pa, cptr, array) + !$omp end target data + end subroutine From 2cf36f0293daf0fa28d5c7d5d3617660edf237e7 Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Wed, 4 Sep 2024 12:36:03 +0100 Subject: [PATCH 081/425] [OpenMP]Update use_device_clause lowering (#101707) This patch updates the use_device_ptr and use_device_addr clauses to use the mapInfoOps for lowering. This allows all the types that are handle by the map clauses such as derived types to also be supported by the use_device_clauses. This is patch 2/2 in a series of patches. 
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 2 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 354 +++++++++--------- mlir/test/Target/LLVMIR/omptarget-llvm.mlir | 16 +- .../openmp-target-use-device-nested.mlir | 39 ++ 4 files changed, 228 insertions(+), 183 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 027b927fa6424..71d51affba642 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6371,6 +6371,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData( if (!updateToLocation(Loc)) return InsertPointTy(); + Builder.restoreIP(CodeGenIP); // Disable TargetData CodeGen on Device pass. if (Config.IsTargetDevice.value_or(false)) { if (BodyGenCB) @@ -6378,7 +6379,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData( return Builder.saveIP(); } - Builder.restoreIP(CodeGenIP); bool IsStandAlone = !BodyGenCB; MapInfosTy *MapInfo; // Generate the code for the opening of the data environment. Capture all the diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 6a32aeb444140..d597bbfee2fe1 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2260,6 +2260,7 @@ getRefPtrIfDeclareTarget(mlir::Value value, return nullptr; } +namespace { // A small helper structure to contain data gathered // for map lowering and coalese it into one area and // avoiding extra computations such as searches in the @@ -2269,6 +2270,8 @@ getRefPtrIfDeclareTarget(mlir::Value value, struct MapInfoData : llvm::OpenMPIRBuilder::MapInfosTy { llvm::SmallVector IsDeclareTarget; llvm::SmallVector IsAMember; + // Identify if mapping was added by mapClause or use_device clauses. + llvm::SmallVector IsAMapping; llvm::SmallVector MapClause; llvm::SmallVector OriginalValue; // Stripped off array/pointer to get the underlying @@ -2286,6 +2289,7 @@ struct MapInfoData : llvm::OpenMPIRBuilder::MapInfosTy { llvm::OpenMPIRBuilder::MapInfosTy::append(CurInfo); } }; +} // namespace uint64_t getArrayElementSizeInBits(LLVM::LLVMArrayType arrTy, DataLayout &dl) { if (auto nestedArrTy = llvm::dyn_cast_if_present( @@ -2352,80 +2356,126 @@ llvm::Value *getSizeInBytes(DataLayout &dl, const mlir::Type &type, return builder.getInt64(dl.getTypeSizeInBits(type) / 8); } -void collectMapDataFromMapVars(MapInfoData &mapData, - llvm::SmallVectorImpl &mapVars, - LLVM::ModuleTranslation &moduleTranslation, - DataLayout &dl, llvm::IRBuilderBase &builder) { - for (mlir::Value mapValue : mapVars) { - if (auto mapOp = mlir::dyn_cast_if_present( - mapValue.getDefiningOp())) { - mlir::Value offloadPtr = +static void collectMapDataFromMapOperands( + MapInfoData &mapData, SmallVectorImpl &mapVars, + LLVM::ModuleTranslation &moduleTranslation, DataLayout &dl, + llvm::IRBuilderBase &builder, const ArrayRef &useDevPtrOperands = {}, + const ArrayRef &useDevAddrOperands = {}) { + auto checkIsAMember = [](const auto &mapVars, auto mapOp) { + // Check if this is a member mapping and correctly assign that it is, if + // it is a member of a larger object. + // TODO: Need better handling of members, and distinguishing of members + // that are implicitly allocated on device vs explicitly passed in as + // arguments. 
+ // TODO: May require some further additions to support nested record + // types, i.e. member maps that can have member maps. + for (Value mapValue : mapVars) { + auto map = cast(mapValue.getDefiningOp()); + for (auto member : map.getMembers()) + if (member == mapOp) + return true; + } + return false; + }; + + // Process MapOperands + for (Value mapValue : mapVars) { + auto mapOp = cast(mapValue.getDefiningOp()); + Value offloadPtr = + mapOp.getVarPtrPtr() ? mapOp.getVarPtrPtr() : mapOp.getVarPtr(); + mapData.OriginalValue.push_back(moduleTranslation.lookupValue(offloadPtr)); + mapData.Pointers.push_back(mapData.OriginalValue.back()); + + if (llvm::Value *refPtr = + getRefPtrIfDeclareTarget(offloadPtr, + moduleTranslation)) { // declare target + mapData.IsDeclareTarget.push_back(true); + mapData.BasePointers.push_back(refPtr); + } else { // regular mapped variable + mapData.IsDeclareTarget.push_back(false); + mapData.BasePointers.push_back(mapData.OriginalValue.back()); + } + + mapData.BaseType.push_back( + moduleTranslation.convertType(mapOp.getVarType())); + mapData.Sizes.push_back( + getSizeInBytes(dl, mapOp.getVarType(), mapOp, mapData.Pointers.back(), + mapData.BaseType.back(), builder, moduleTranslation)); + mapData.MapClause.push_back(mapOp.getOperation()); + mapData.Types.push_back( + llvm::omp::OpenMPOffloadMappingFlags(mapOp.getMapType().value())); + mapData.Names.push_back(LLVM::createMappingInformation( + mapOp.getLoc(), *moduleTranslation.getOpenMPBuilder())); + mapData.DevicePointers.push_back(llvm::OpenMPIRBuilder::DeviceInfoTy::None); + mapData.IsAMapping.push_back(true); + mapData.IsAMember.push_back(checkIsAMember(mapVars, mapOp)); + } + + auto findMapInfo = [&mapData](llvm::Value *val, + llvm::OpenMPIRBuilder::DeviceInfoTy devInfoTy) { + unsigned index = 0; + bool found = false; + for (llvm::Value *basePtr : mapData.OriginalValue) { + if (basePtr == val && mapData.IsAMapping[index]) { + found = true; + mapData.Types[index] |= + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; + mapData.DevicePointers[index] = devInfoTy; + } + index++; + } + return found; + }; + + // Process useDevPtr(Addr)Operands + auto addDevInfos = [&](const llvm::ArrayRef &useDevOperands, + llvm::OpenMPIRBuilder::DeviceInfoTy devInfoTy) { + for (Value mapValue : useDevOperands) { + auto mapOp = cast(mapValue.getDefiningOp()); + Value offloadPtr = mapOp.getVarPtrPtr() ? mapOp.getVarPtrPtr() : mapOp.getVarPtr(); - mapData.OriginalValue.push_back( - moduleTranslation.lookupValue(offloadPtr)); - mapData.Pointers.push_back(mapData.OriginalValue.back()); - - if (llvm::Value *refPtr = - getRefPtrIfDeclareTarget(offloadPtr, - moduleTranslation)) { // declare target - mapData.IsDeclareTarget.push_back(true); - mapData.BasePointers.push_back(refPtr); - } else { // regular mapped variable + llvm::Value *origValue = moduleTranslation.lookupValue(offloadPtr); + + // Check if map info is already present for this entry. 
+ if (!findMapInfo(origValue, devInfoTy)) { + mapData.OriginalValue.push_back(origValue); + mapData.Pointers.push_back(mapData.OriginalValue.back()); mapData.IsDeclareTarget.push_back(false); mapData.BasePointers.push_back(mapData.OriginalValue.back()); - } - - mapData.BaseType.push_back( - moduleTranslation.convertType(mapOp.getVarType())); - mapData.Sizes.push_back( - getSizeInBytes(dl, mapOp.getVarType(), mapOp, mapData.Pointers.back(), - mapData.BaseType.back(), builder, moduleTranslation)); - mapData.MapClause.push_back(mapOp.getOperation()); - mapData.Types.push_back( - llvm::omp::OpenMPOffloadMappingFlags(mapOp.getMapType().value())); - mapData.Names.push_back(LLVM::createMappingInformation( - mapOp.getLoc(), *moduleTranslation.getOpenMPBuilder())); - mapData.DevicePointers.push_back( - llvm::OpenMPIRBuilder::DeviceInfoTy::None); - - // Check if this is a member mapping and correctly assign that it is, if - // it is a member of a larger object. - // TODO: Need better handling of members, and distinguishing of members - // that are implicitly allocated on device vs explicitly passed in as - // arguments. - // TODO: May require some further additions to support nested record - // types, i.e. member maps that can have member maps. - mapData.IsAMember.push_back(false); - for (mlir::Value mapValue : mapVars) { - if (auto map = mlir::dyn_cast_if_present( - mapValue.getDefiningOp())) { - for (auto member : map.getMembers()) { - if (member == mapOp) { - mapData.IsAMember.back() = true; - } - } - } + mapData.BaseType.push_back( + moduleTranslation.convertType(mapOp.getVarType())); + mapData.Sizes.push_back(builder.getInt64(0)); + mapData.MapClause.push_back(mapOp.getOperation()); + mapData.Types.push_back( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM); + mapData.Names.push_back(LLVM::createMappingInformation( + mapOp.getLoc(), *moduleTranslation.getOpenMPBuilder())); + mapData.DevicePointers.push_back(devInfoTy); + mapData.IsAMapping.push_back(true); + mapData.IsAMember.push_back(checkIsAMember(useDevOperands, mapOp)); } } - } + }; + + addDevInfos(useDevPtrOperands, llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer); + addDevInfos(useDevAddrOperands, llvm::OpenMPIRBuilder::DeviceInfoTy::Address); } -static int getMapDataMemberIdx(MapInfoData &mapData, - mlir::omp::MapInfoOp memberOp) { +static int getMapDataMemberIdx(MapInfoData &mapData, omp::MapInfoOp memberOp) { auto *res = llvm::find(mapData.MapClause, memberOp); assert(res != mapData.MapClause.end() && "MapInfoOp for member not found in MapData, cannot return index"); return std::distance(mapData.MapClause.begin(), res); } -static mlir::omp::MapInfoOp -getFirstOrLastMappedMemberPtr(mlir::omp::MapInfoOp mapInfo, bool first) { - mlir::DenseIntElementsAttr indexAttr = mapInfo.getMembersIndexAttr(); +static omp::MapInfoOp getFirstOrLastMappedMemberPtr(omp::MapInfoOp mapInfo, + bool first) { + DenseIntElementsAttr indexAttr = mapInfo.getMembersIndexAttr(); // Only 1 member has been mapped, we can return it. 
if (indexAttr.size() == 1) - if (auto mapOp = mlir::dyn_cast( - mapInfo.getMembers()[0].getDefiningOp())) + if (auto mapOp = + dyn_cast(mapInfo.getMembers()[0].getDefiningOp())) return mapOp; llvm::ArrayRef shape = indexAttr.getShapedType().getShape(); @@ -2462,7 +2512,7 @@ getFirstOrLastMappedMemberPtr(mlir::omp::MapInfoOp mapInfo, bool first) { return false; }); - return llvm::cast( + return llvm::cast( mapInfo.getMembers()[indices.front()].getDefiningOp()); } @@ -2488,7 +2538,7 @@ getFirstOrLastMappedMemberPtr(mlir::omp::MapInfoOp mapInfo, bool first) { std::vector calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder, bool isArrayTy, - mlir::OperandRange bounds) { + OperandRange bounds) { std::vector idx; // There's no bounds to calculate an offset from, we can safely // ignore and return no indices. @@ -2502,7 +2552,7 @@ calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation, if (isArrayTy) { idx.push_back(builder.getInt64(0)); for (int i = bounds.size() - 1; i >= 0; --i) { - if (auto boundOp = mlir::dyn_cast_if_present( + if (auto boundOp = dyn_cast_if_present( bounds[i].getDefiningOp())) { idx.push_back(moduleTranslation.lookupValue(boundOp.getLowerBound())); } @@ -2528,7 +2578,7 @@ calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation, // (extent/size of current) 100 for 1000 for each index increment std::vector dimensionIndexSizeOffset{builder.getInt64(1)}; for (size_t i = 1; i < bounds.size(); ++i) { - if (auto boundOp = mlir::dyn_cast_if_present( + if (auto boundOp = dyn_cast_if_present( bounds[i].getDefiningOp())) { dimensionIndexSizeOffset.push_back(builder.CreateMul( moduleTranslation.lookupValue(boundOp.getExtent()), @@ -2541,7 +2591,7 @@ calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation, // have calculated in the previous and accumulate the results to get // our final resulting offset. for (int i = bounds.size() - 1; i >= 0; --i) { - if (auto boundOp = mlir::dyn_cast_if_present( + if (auto boundOp = dyn_cast_if_present( bounds[i].getDefiningOp())) { if (idx.empty()) idx.emplace_back(builder.CreateMul( @@ -2585,7 +2635,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( ? llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE); combinedInfo.DevicePointers.emplace_back( - llvm::OpenMPIRBuilder::DeviceInfoTy::None); + mapData.DevicePointers[mapDataIndex]); combinedInfo.Names.emplace_back(LLVM::createMappingInformation( mapData.MapClause[mapDataIndex]->getLoc(), ompBuilder)); combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIndex]); @@ -2598,7 +2648,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( // data by the descriptor (which itself, is a structure containing // runtime information on the dynamically allocated data). 
auto parentClause = - llvm::cast(mapData.MapClause[mapDataIndex]); + llvm::cast(mapData.MapClause[mapDataIndex]); llvm::Value *lowAddr, *highAddr; if (!parentClause.getPartialMap()) { @@ -2610,8 +2660,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( builder.getPtrTy()); combinedInfo.Pointers.emplace_back(mapData.Pointers[mapDataIndex]); } else { - auto mapOp = - mlir::dyn_cast(mapData.MapClause[mapDataIndex]); + auto mapOp = dyn_cast(mapData.MapClause[mapDataIndex]); int firstMemberIdx = getMapDataMemberIdx( mapData, getFirstOrLastMappedMemberPtr(mapOp, true)); lowAddr = builder.CreatePointerCast(mapData.Pointers[firstMemberIdx], @@ -2669,7 +2718,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( // There may be a better way to verify this, but unfortunately with // opaque pointers we lose the ability to easily check if something is // a pointer whilst maintaining access to the underlying type. -static bool checkIfPointerMap(mlir::omp::MapInfoOp mapOp) { +static bool checkIfPointerMap(omp::MapInfoOp mapOp) { // If we have a varPtrPtr field assigned then the underlying type is a pointer if (mapOp.getVarPtrPtr()) return true; @@ -2691,11 +2740,11 @@ static void processMapMembersWithParent( uint64_t mapDataIndex, llvm::omp::OpenMPOffloadMappingFlags memberOfFlag) { auto parentClause = - llvm::cast(mapData.MapClause[mapDataIndex]); + llvm::cast(mapData.MapClause[mapDataIndex]); for (auto mappedMembers : parentClause.getMembers()) { auto memberClause = - llvm::cast(mappedMembers.getDefiningOp()); + llvm::cast(mappedMembers.getDefiningOp()); int memberDataIdx = getMapDataMemberIdx(mapData, memberClause); assert(memberDataIdx >= 0 && "could not find mapped member of structure"); @@ -2712,7 +2761,7 @@ static void processMapMembersWithParent( combinedInfo.Types.emplace_back(mapFlag); combinedInfo.DevicePointers.emplace_back( - llvm::OpenMPIRBuilder::DeviceInfoTy::None); + mapData.DevicePointers[memberDataIdx]); combinedInfo.Names.emplace_back( LLVM::createMappingInformation(memberClause.getLoc(), ompBuilder)); combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIndex]); @@ -2729,8 +2778,7 @@ processIndividualMap(MapInfoData &mapData, size_t mapDataIdx, // OMP_MAP_TARGET_PARAM as they are not passed as parameters, they're // marked with OMP_MAP_PTR_AND_OBJ instead. auto mapFlag = mapData.Types[mapDataIdx]; - auto mapInfoOp = - llvm::cast(mapData.MapClause[mapDataIdx]); + auto mapInfoOp = llvm::cast(mapData.MapClause[mapDataIdx]); bool isPtrTy = checkIfPointerMap(mapInfoOp); if (isPtrTy) @@ -2740,7 +2788,7 @@ processIndividualMap(MapInfoData &mapData, size_t mapDataIdx, mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM; if (mapInfoOp.getMapCaptureType().value() == - mlir::omp::VariableCaptureKind::ByCopy && + omp::VariableCaptureKind::ByCopy && !isPtrTy) mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_LITERAL; @@ -2766,13 +2814,13 @@ static void processMapWithMembersOf( llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, MapInfoData &mapData, uint64_t mapDataIndex, bool isTargetParams) { auto parentClause = - llvm::cast(mapData.MapClause[mapDataIndex]); + llvm::cast(mapData.MapClause[mapDataIndex]); // If we have a partial map (no parent referenced in the map clauses of the // directive, only members) and only a single member, we do not need to bind // the map of the member to the parent, we can pass the member separately. 
if (parentClause.getMembers().size() == 1 && parentClause.getPartialMap()) { - auto memberClause = llvm::cast( + auto memberClause = llvm::cast( parentClause.getMembers()[0].getDefiningOp()); int memberDataIdx = getMapDataMemberIdx(mapData, memberClause); // Note: Clang treats arrays with explicit bounds that fall into this @@ -2809,11 +2857,9 @@ createAlteredByCaptureMap(MapInfoData &mapData, for (size_t i = 0; i < mapData.MapClause.size(); ++i) { // if it's declare target, skip it, it's handled separately. if (!mapData.IsDeclareTarget[i]) { - auto mapOp = - mlir::dyn_cast_if_present(mapData.MapClause[i]); - mlir::omp::VariableCaptureKind captureKind = - mapOp.getMapCaptureType().value_or( - mlir::omp::VariableCaptureKind::ByRef); + auto mapOp = cast(mapData.MapClause[i]); + omp::VariableCaptureKind captureKind = + mapOp.getMapCaptureType().value_or(omp::VariableCaptureKind::ByRef); bool isPtrTy = checkIfPointerMap(mapOp); // Currently handles array sectioning lowerbound case, but more @@ -2824,7 +2870,7 @@ createAlteredByCaptureMap(MapInfoData &mapData, // function mimics some of the logic from Clang that we require for // kernel argument passing from host -> device. switch (captureKind) { - case mlir::omp::VariableCaptureKind::ByRef: { + case omp::VariableCaptureKind::ByRef: { llvm::Value *newV = mapData.Pointers[i]; std::vector offsetIdx = calculateBoundsOffset( moduleTranslation, builder, mapData.BaseType[i]->isArrayTy(), @@ -2837,7 +2883,7 @@ createAlteredByCaptureMap(MapInfoData &mapData, "array_offset"); mapData.Pointers[i] = newV; } break; - case mlir::omp::VariableCaptureKind::ByCopy: { + case omp::VariableCaptureKind::ByCopy: { llvm::Type *type = mapData.BaseType[i]; llvm::Value *newV; if (mapData.Pointers[i]->getType()->isPointerTy()) @@ -2859,8 +2905,8 @@ createAlteredByCaptureMap(MapInfoData &mapData, mapData.Pointers[i] = newV; mapData.BasePointers[i] = newV; } break; - case mlir::omp::VariableCaptureKind::This: - case mlir::omp::VariableCaptureKind::VLAType: + case omp::VariableCaptureKind::This: + case omp::VariableCaptureKind::VLAType: mapData.MapClause[i]->emitOpError("Unhandled capture kind"); break; } @@ -2873,10 +2919,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, DataLayout &dl, llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, - MapInfoData &mapData, - const SmallVector &useDevicePtrVars = {}, - const SmallVector &useDeviceAddrVars = {}, - bool isTargetParams = false) { + MapInfoData &mapData, bool isTargetParams = false) { // We wish to modify some of the methods in which arguments are // passed based on their capture type by the target region, this can // involve generating new loads and stores, which changes the @@ -2893,15 +2936,6 @@ static void genMapInfos(llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - auto fail = [&combinedInfo]() -> void { - combinedInfo.BasePointers.clear(); - combinedInfo.Pointers.clear(); - combinedInfo.DevicePointers.clear(); - combinedInfo.Sizes.clear(); - combinedInfo.Types.clear(); - combinedInfo.Names.clear(); - }; - // We operate under the assumption that all vectors that are // required in MapInfoData are of equal lengths (either filled with // default constructed data or appropiate information) so we can @@ -2913,7 +2947,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder, if (mapData.IsAMember[i]) continue; - auto mapInfoOp = mlir::dyn_cast(mapData.MapClause[i]); + auto mapInfoOp = dyn_cast(mapData.MapClause[i]); 
if (!mapInfoOp.getMembers().empty()) { processMapWithMembersOf(moduleTranslation, builder, *ompBuilder, dl, combinedInfo, mapData, i, isTargetParams); @@ -2922,46 +2956,6 @@ static void genMapInfos(llvm::IRBuilderBase &builder, processIndividualMap(mapData, i, combinedInfo, isTargetParams); } - - auto findMapInfo = [&combinedInfo](llvm::Value *val, unsigned &index) { - index = 0; - for (llvm::Value *basePtr : combinedInfo.BasePointers) { - if (basePtr == val) - return true; - index++; - } - return false; - }; - - auto addDevInfos = [&, fail](auto useDeviceVars, auto devOpType) -> void { - for (const auto &useDeviceVar : useDeviceVars) { - // TODO: Only LLVMPointerTypes are handled. - if (!isa(useDeviceVar.getType())) - return fail(); - - llvm::Value *mapOpValue = moduleTranslation.lookupValue(useDeviceVar); - - // Check if map info is already present for this entry. - unsigned infoIndex; - if (findMapInfo(mapOpValue, infoIndex)) { - combinedInfo.Types[infoIndex] |= - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; - combinedInfo.DevicePointers[infoIndex] = devOpType; - } else { - combinedInfo.BasePointers.emplace_back(mapOpValue); - combinedInfo.Pointers.emplace_back(mapOpValue); - combinedInfo.DevicePointers.emplace_back(devOpType); - combinedInfo.Names.emplace_back( - LLVM::createMappingInformation(useDeviceVar.getLoc(), *ompBuilder)); - combinedInfo.Types.emplace_back( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM); - combinedInfo.Sizes.emplace_back(builder.getInt64(0)); - } - } - }; - - addDevInfos(useDevicePtrVars, llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer); - addDevInfos(useDeviceAddrVars, llvm::OpenMPIRBuilder::DeviceInfoTy::Address); } static LogicalResult @@ -3058,19 +3052,15 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; MapInfoData mapData; - collectMapDataFromMapVars(mapData, mapVars, moduleTranslation, DL, builder); + collectMapDataFromMapOperands(mapData, mapVars, moduleTranslation, DL, + builder, useDevicePtrVars, useDeviceAddrVars); // Fill up the arrays with all the mapped variables. 
llvm::OpenMPIRBuilder::MapInfosTy combinedInfo; auto genMapInfoCB = [&](InsertPointTy codeGenIP) -> llvm::OpenMPIRBuilder::MapInfosTy & { builder.restoreIP(codeGenIP); - if (auto dataOp = dyn_cast(op)) { - genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData, - useDevicePtrVars, useDeviceAddrVars); - } else { - genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData); - } + genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData); return combinedInfo; }; @@ -3089,21 +3079,21 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, if (!info.DevicePtrInfoMap.empty()) { builder.restoreIP(codeGenIP); unsigned argIndex = 0; - for (auto &devPtrOp : useDevicePtrVars) { - llvm::Value *mapOpValue = moduleTranslation.lookupValue(devPtrOp); - const auto &arg = region.front().getArgument(argIndex); - moduleTranslation.mapValue(arg, - info.DevicePtrInfoMap[mapOpValue].second); - argIndex++; - } - - for (auto &devAddrOp : useDeviceAddrVars) { - llvm::Value *mapOpValue = moduleTranslation.lookupValue(devAddrOp); - const auto &arg = region.front().getArgument(argIndex); - auto *LI = builder.CreateLoad( - builder.getPtrTy(), info.DevicePtrInfoMap[mapOpValue].second); - moduleTranslation.mapValue(arg, LI); - argIndex++; + for (auto [basePointer, devicePointer] : llvm::zip_equal( + combinedInfo.BasePointers, combinedInfo.DevicePointers)) { + if (devicePointer == llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer) { + const auto &arg = region.front().getArgument(argIndex); + moduleTranslation.mapValue( + arg, info.DevicePtrInfoMap[basePointer].second); + argIndex++; + } else if (devicePointer == + llvm::OpenMPIRBuilder::DeviceInfoTy::Address) { + const auto &arg = region.front().getArgument(argIndex); + auto *loadInst = builder.CreateLoad( + builder.getPtrTy(), info.DevicePtrInfoMap[basePointer].second); + moduleTranslation.mapValue(arg, loadInst); + argIndex++; + } } bodyGenStatus = inlineConvertOmpRegions(region, "omp.data.region", @@ -3116,6 +3106,20 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, // If device info is available then region has already been generated if (info.DevicePtrInfoMap.empty()) { builder.restoreIP(codeGenIP); + // For device pass, if use_device_ptr(addr) mappings were present, + // we need to link them here before codegen. 
+ if (ompBuilder->Config.IsTargetDevice.value_or(false)) { + unsigned argIndex = 0; + for (auto [basePointer, devicePointer] : + llvm::zip_equal(mapData.BasePointers, mapData.DevicePointers)) { + if (devicePointer == llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer || + devicePointer == llvm::OpenMPIRBuilder::DeviceInfoTy::Address) { + const auto &arg = region.front().getArgument(argIndex); + moduleTranslation.mapValue(arg, basePointer); + argIndex++; + } + } + } bodyGenStatus = inlineConvertOmpRegions(region, "omp.data.region", builder, moduleTranslation); } @@ -3335,17 +3339,14 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg, llvm::IRBuilderBase::InsertPoint codeGenIP) { builder.restoreIP(allocaIP); - mlir::omp::VariableCaptureKind capture = - mlir::omp::VariableCaptureKind::ByRef; + omp::VariableCaptureKind capture = omp::VariableCaptureKind::ByRef; // Find the associated MapInfoData entry for the current input for (size_t i = 0; i < mapData.MapClause.size(); ++i) if (mapData.OriginalValue[i] == input) { - if (auto mapOp = mlir::dyn_cast_if_present( - mapData.MapClause[i])) { - capture = mapOp.getMapCaptureType().value_or( - mlir::omp::VariableCaptureKind::ByRef); - } + auto mapOp = cast(mapData.MapClause[i]); + capture = + mapOp.getMapCaptureType().value_or(omp::VariableCaptureKind::ByRef); break; } @@ -3366,18 +3367,18 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg, builder.restoreIP(codeGenIP); switch (capture) { - case mlir::omp::VariableCaptureKind::ByCopy: { + case omp::VariableCaptureKind::ByCopy: { retVal = v; break; } - case mlir::omp::VariableCaptureKind::ByRef: { + case omp::VariableCaptureKind::ByRef: { retVal = builder.CreateAlignedLoad( v->getType(), v, ompBuilder.M.getDataLayout().getPrefTypeAlign(v->getType())); break; } - case mlir::omp::VariableCaptureKind::This: - case mlir::omp::VariableCaptureKind::VLAType: + case omp::VariableCaptureKind::This: + case omp::VariableCaptureKind::VLAType: assert(false && "Currently unsupported capture kind"); break; } @@ -3429,8 +3430,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, builder.restoreIP(codeGenIP); unsigned argIndex = 0; for (auto &mapOp : mapVars) { - auto mapInfoOp = - mlir::dyn_cast(mapOp.getDefiningOp()); + auto mapInfoOp = cast(mapOp.getDefiningOp()); llvm::Value *mapOpValue = moduleTranslation.lookupValue(mapInfoOp.getVarPtr()); const auto &arg = targetRegion.front().getArgument(argIndex); @@ -3458,14 +3458,14 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, findAllocaInsertPoint(builder, moduleTranslation); MapInfoData mapData; - collectMapDataFromMapVars(mapData, mapVars, moduleTranslation, dl, builder); + collectMapDataFromMapOperands(mapData, mapVars, moduleTranslation, dl, + builder); llvm::OpenMPIRBuilder::MapInfosTy combinedInfos; auto genMapInfoCB = [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP) -> llvm::OpenMPIRBuilder::MapInfosTy & { builder.restoreIP(codeGenIP); - genMapInfos(builder, moduleTranslation, dl, combinedInfos, mapData, {}, {}, - true); + genMapInfos(builder, moduleTranslation, dl, combinedInfos, mapData, true); return combinedInfos; }; diff --git a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir index bf9fa183bfb80..458d2f28a78f8 100644 --- a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir @@ -209,7 +209,8 @@ llvm.func @_QPopenmp_target_use_dev_ptr() { %0 = llvm.mlir.constant(1 : i64) : i64 %a = llvm.alloca %0 x 
!llvm.ptr : (i64) -> !llvm.ptr %map1 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map1 : !llvm.ptr) use_device_ptr(%a : !llvm.ptr) { + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map1 : !llvm.ptr) use_device_ptr(%map2 : !llvm.ptr) { ^bb0(%arg0: !llvm.ptr): %1 = llvm.mlir.constant(10 : i32) : i32 %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr @@ -253,7 +254,8 @@ llvm.func @_QPopenmp_target_use_dev_addr() { %0 = llvm.mlir.constant(1 : i64) : i64 %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%a : !llvm.ptr) { + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 : !llvm.ptr) { ^bb0(%arg0: !llvm.ptr): %1 = llvm.mlir.constant(10 : i32) : i32 %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr @@ -295,7 +297,8 @@ llvm.func @_QPopenmp_target_use_dev_addr_no_ptr() { %0 = llvm.mlir.constant(1 : i64) : i64 %a = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr %map = omp.map.info var_ptr(%a : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%a : !llvm.ptr) { + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 : !llvm.ptr) { ^bb0(%arg0: !llvm.ptr): %1 = llvm.mlir.constant(10 : i32) : i32 llvm.store %1, %arg0 : i32, !llvm.ptr @@ -337,7 +340,8 @@ llvm.func @_QPopenmp_target_use_dev_addr_nomap() { %1 = llvm.mlir.constant(1 : i64) : i64 %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr %map = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%a : !llvm.ptr) { + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 : !llvm.ptr) { ^bb0(%arg0: !llvm.ptr): %2 = llvm.mlir.constant(10 : i32) : i32 %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr @@ -394,7 +398,9 @@ llvm.func @_QPopenmp_target_use_dev_both() { %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} %map1 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map, %map1 : !llvm.ptr, !llvm.ptr) use_device_ptr(%a : !llvm.ptr) use_device_addr(%b : !llvm.ptr) { + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + %map3 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map, %map1 : !llvm.ptr, !llvm.ptr) use_device_ptr(%map2 : !llvm.ptr) use_device_addr(%map3 : !llvm.ptr) { ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): %2 = llvm.mlir.constant(10 : i32) : i32 %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr diff --git 
a/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir b/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir new file mode 100644 index 0000000000000..f094a46581dee --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir @@ -0,0 +1,39 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// This tests check that target code nested inside a target data region which +// has only use_device_ptr mapping corectly generates code on the device pass. + +// CHECK: define weak_odr protected void @__omp_offloading{{.*}}main_ +// CHECK-NEXT: entry: +// CHECK-NEXT: %[[VAL_3:.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr %[[VAL_4:.*]], ptr %[[VAL_3]], align 8 +// CHECK-NEXT: %[[VAL_5:.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_{{.*}}_kernel_environment, ptr %[[VAL_6:.*]]) +// CHECK-NEXT: %[[VAL_7:.*]] = icmp eq i32 %[[VAL_5]], -1 +// CHECK-NEXT: br i1 %[[VAL_7]], label %[[VAL_8:.*]], label %[[VAL_9:.*]] +// CHECK: user_code.entry: ; preds = %[[VAL_10:.*]] +// CHECK-NEXT: %[[VAL_11:.*]] = load ptr, ptr %[[VAL_3]], align 8 +// CHECK-NEXT: br label %[[VAL_12:.*]] +// CHECK: omp.target: ; preds = %[[VAL_8]] +// CHECK-NEXT: %[[VAL_13:.*]] = load ptr, ptr %[[VAL_11]], align 8 +// CHECK-NEXT: store i32 999, ptr %[[VAL_13]], align 4 +// CHECK-NEXT: br label %[[VAL_14:.*]] +module attributes {omp.is_target_device = true } { + llvm.func @_QQmain() attributes {fir.bindc_name = "main"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr + %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data use_device_ptr(%map : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr): + %map1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target map_entries(%map1 : !llvm.ptr){ + ^bb0(%arg1: !llvm.ptr): + %1 = llvm.mlir.constant(999 : i32) : i32 + %2 = llvm.load %arg1 : !llvm.ptr -> !llvm.ptr + llvm.store %1, %2 : i32, !llvm.ptr + omp.terminator + } + omp.terminator + } + llvm.return + } +} From 7330c9b033f29bd92b99db6282c0f71de64122ab Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 4 Sep 2024 19:46:27 +0800 Subject: [PATCH 082/425] [flang] Fix -Wunneeded-internal-declaration in DirectivesCommon.h (NFC) /llvm-project/flang/lib/Lower/DirectivesCommon.h:910:1: error: 'static' function 'peelOuterConvert' declared in header file should be declared 'static inline' [-Werror,-Wunneeded-internal-declaration] peelOuterConvert(Fortran::semantics::SomeExpr &expr) { ^ 1 error generated. --- flang/lib/Lower/DirectivesCommon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h index d8b1f1f3e4362..d2060e77ce530 100644 --- a/flang/lib/Lower/DirectivesCommon.h +++ b/flang/lib/Lower/DirectivesCommon.h @@ -906,7 +906,7 @@ struct PeelConvert { } }; -static Fortran::semantics::SomeExpr +static inline Fortran::semantics::SomeExpr peelOuterConvert(Fortran::semantics::SomeExpr &expr) { if (auto peeled = PeelConvert::visit(expr)) return *peeled; From d94199c8ffba3f3895da7627d3dbbca62937310c Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Wed, 4 Sep 2024 13:49:48 +0200 Subject: [PATCH 083/425] [clang] Make lifetimebound and GSL analysis more coherent (#105884) This allows clang to detect more use-after-free bugs (shown in the #100549). 
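As a rough illustration (a reduced sketch of the warn-lifetime-analysis-nocfg.cpp
tests added by this patch; StrCat and ReturnStringView are stand-in declarations,
not real APIs), a GSL pointer obtained through a [[clang::lifetimebound]]
parameter from a temporary is now diagnosed:

  #include <string>
  #include <string_view>

  std::string StrCat(std::string_view, std::string_view);
  std::string_view ReturnStringView(std::string_view s [[clang::lifetimebound]]);

  void use() {
    std::string_view sv = ReturnStringView(StrCat("bar", "x"));
    // warning: object backing the pointer will be destroyed at the end of
    // the full-expression
  }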
This relands the remaining change (removing the EnableLifetimeWarnings flag) in https://github.com/llvm/llvm-project/pull/104906, with a proper fix for the regression. Fixes #100549 --- clang/docs/ReleaseNotes.rst | 3 + clang/lib/Sema/CheckExprLifetime.cpp | 154 +++++++----------- .../Sema/warn-lifetime-analysis-nocfg.cpp | 26 +++ 3 files changed, 87 insertions(+), 96 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 251eb4c1c4559..bcdbc5b702765 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -270,6 +270,9 @@ Improvements to Clang's diagnostics - Clang now respects lifetimebound attribute for the assignment operator parameter. (#GH106372). +- The lifetimebound and GSL analysis in clang are coherent, allowing clang to + detect more use-after-free bugs. (#GH100549). + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index f7540a6e3a897..1482711cc2839 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -237,13 +237,11 @@ static bool pathContainsInit(IndirectLocalPath &Path) { static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Init, LocalVisitor Visit, - bool RevisitSubinits, - bool EnableLifetimeWarnings); + bool RevisitSubinits); static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, Expr *Init, ReferenceKind RK, - LocalVisitor Visit, - bool EnableLifetimeWarnings); + LocalVisitor Visit); template static bool isRecordWithAttr(QualType Type) { if (auto *RD = Type->getAsCXXRecordDecl()) @@ -369,8 +367,7 @@ static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { // Visit lifetimebound or gsl-pointer arguments. static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, - LocalVisitor Visit, - bool EnableLifetimeWarnings) { + LocalVisitor Visit) { const FunctionDecl *Callee; ArrayRef Args; @@ -385,6 +382,8 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, if (!Callee) return; + bool EnableGSLAnalysis = !Callee->getASTContext().getDiagnostics().isIgnored( + diag::warn_dangling_lifetime_pointer, SourceLocation()); Expr *ObjectArg = nullptr; if (isa(Call) && Callee->isCXXInstanceMember()) { ObjectArg = Args[0]; @@ -397,11 +396,9 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, Path.push_back({IndirectLocalPathEntry::LifetimeBoundCall, Arg, D}); if (Arg->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit, - /*EnableLifetimeWarnings=*/false); + Visit); else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings=*/false); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true); Path.pop_back(); }; auto VisitGSLPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { @@ -412,7 +409,8 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, // Once we initialized a value with a reference, it can no longer dangle. 
if (!Value) { for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { - if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) + if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit || + PE.Kind == IndirectLocalPathEntry::LifetimeBoundCall) continue; if (PE.Kind == IndirectLocalPathEntry::GslPointerInit || PE.Kind == IndirectLocalPathEntry::GslPointerAssignment) @@ -425,11 +423,9 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, Arg, D}); if (Arg->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit, - /*EnableLifetimeWarnings=*/true); + Visit); else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings=*/true); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true); Path.pop_back(); }; @@ -452,7 +448,7 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, CheckCoroObjArg = false; if (implicitObjectParamIsLifetimeBound(Callee) || CheckCoroObjArg) VisitLifetimeBoundArg(Callee, ObjectArg); - else if (EnableLifetimeWarnings) { + else if (EnableGSLAnalysis) { if (auto *CME = dyn_cast(Callee); CME && shouldTrackImplicitObjectArg(CME)) VisitGSLPointerArg(Callee, ObjectArg, @@ -465,15 +461,15 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, I != N; ++I) { if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); - else if (EnableLifetimeWarnings && I == 0) { + else if (EnableGSLAnalysis && I == 0) { if (shouldTrackFirstArgument(Callee)) { VisitGSLPointerArg(Callee, Args[0], !Callee->getReturnType()->isReferenceType()); - } else { - if (auto *CCE = dyn_cast(Call); - CCE && CCE->getConstructor()->getParent()->hasAttr()) - VisitGSLPointerArg(CCE->getConstructor()->getParamDecl(0), Args[0], - true); + } else if (auto *CCE = dyn_cast(Call); + CCE && + CCE->getConstructor()->getParent()->hasAttr()) { + VisitGSLPointerArg(CCE->getConstructor()->getParamDecl(0), Args[0], + true); } } } @@ -483,8 +479,7 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, /// glvalue expression \c Init. static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, Expr *Init, ReferenceKind RK, - LocalVisitor Visit, - bool EnableLifetimeWarnings) { + LocalVisitor Visit) { RevertToOldSizeRAII RAII(Path); // Walk past any constructs which we can lifetime-extend across. @@ -521,8 +516,7 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, else // We can't lifetime extend through this but we might still find some // retained temporaries. - return visitLocalsRetainedByInitializer(Path, Init, Visit, true, - EnableLifetimeWarnings); + return visitLocalsRetainedByInitializer(Path, Init, Visit, true); } // Step into CXXDefaultInitExprs so we can diagnose cases where a @@ -536,21 +530,18 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, if (auto *MTE = dyn_cast(Init)) { if (Visit(Path, Local(MTE), RK)) - visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, true); } if (auto *M = dyn_cast(Init)) { // Lifetime of a non-reference type field is same as base object. 
if (auto *F = dyn_cast(M->getMemberDecl()); F && !F->getType()->isReferenceType()) - visitLocalsRetainedByInitializer(Path, M->getBase(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, M->getBase(), Visit, true); } if (isa(Init)) - return visitFunctionCallArguments(Path, Init, Visit, - EnableLifetimeWarnings); + return visitFunctionCallArguments(Path, Init, Visit); switch (Init->getStmtClass()) { case Stmt::DeclRefExprClass: { @@ -569,8 +560,7 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, } else if (VD->getInit() && !isVarOnPath(Path, VD)) { Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); visitLocalsRetainedByReferenceBinding(Path, VD->getInit(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); } } break; @@ -582,15 +572,13 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, // handling all sorts of rvalues passed to a unary operator. const UnaryOperator *U = cast(Init); if (U->getOpcode() == UO_Deref) - visitLocalsRetainedByInitializer(Path, U->getSubExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, U->getSubExpr(), Visit, true); break; } case Stmt::ArraySectionExprClass: { - visitLocalsRetainedByInitializer(Path, - cast(Init)->getBase(), - Visit, true, EnableLifetimeWarnings); + visitLocalsRetainedByInitializer( + Path, cast(Init)->getBase(), Visit, true); break; } @@ -598,11 +586,9 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, case Stmt::BinaryConditionalOperatorClass: { auto *C = cast(Init); if (!C->getTrueExpr()->getType()->isVoidType()) - visitLocalsRetainedByReferenceBinding(Path, C->getTrueExpr(), RK, Visit, - EnableLifetimeWarnings); + visitLocalsRetainedByReferenceBinding(Path, C->getTrueExpr(), RK, Visit); if (!C->getFalseExpr()->getType()->isVoidType()) - visitLocalsRetainedByReferenceBinding(Path, C->getFalseExpr(), RK, Visit, - EnableLifetimeWarnings); + visitLocalsRetainedByReferenceBinding(Path, C->getFalseExpr(), RK, Visit); break; } @@ -625,8 +611,7 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, /// the prvalue expression \c Init. static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Init, LocalVisitor Visit, - bool RevisitSubinits, - bool EnableLifetimeWarnings) { + bool RevisitSubinits) { RevertToOldSizeRAII RAII(Path); Expr *Old; @@ -667,18 +652,16 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (VD && VD->getType().isConstQualified() && VD->getInit() && !isVarOnPath(Path, VD)) { Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); - visitLocalsRetainedByInitializer( - Path, VD->getInit(), Visit, true, EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, VD->getInit(), Visit, + true); } } else if (auto *MTE = dyn_cast(L)) { if (MTE->getType().isConstQualified()) visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), - Visit, true, - EnableLifetimeWarnings); + Visit, true); } return false; - }, - EnableLifetimeWarnings); + }); // We assume that objects can be retained by pointers cast to integers, // but not if the integer is cast to floating-point type or to _Complex. @@ -707,9 +690,8 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // Model array-to-pointer decay as taking the address of the array // lvalue. 
Path.push_back({IndirectLocalPathEntry::AddressOf, CE}); - return visitLocalsRetainedByReferenceBinding(Path, CE->getSubExpr(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + return visitLocalsRetainedByReferenceBinding( + Path, CE->getSubExpr(), RK_ReferenceBinding, Visit); default: return; @@ -724,8 +706,7 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // lifetime of the array exactly like binding a reference to a temporary. if (auto *ILE = dyn_cast(Init)) return visitLocalsRetainedByReferenceBinding(Path, ILE->getSubExpr(), - RK_StdInitializerList, Visit, - EnableLifetimeWarnings); + RK_StdInitializerList, Visit); if (InitListExpr *ILE = dyn_cast(Init)) { // We already visited the elements of this initializer list while @@ -736,14 +717,12 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (ILE->isTransparent()) return visitLocalsRetainedByInitializer(Path, ILE->getInit(0), Visit, - RevisitSubinits, - EnableLifetimeWarnings); + RevisitSubinits); if (ILE->getType()->isArrayType()) { for (unsigned I = 0, N = ILE->getNumInits(); I != N; ++I) visitLocalsRetainedByInitializer(Path, ILE->getInit(I), Visit, - RevisitSubinits, - EnableLifetimeWarnings); + RevisitSubinits); return; } @@ -756,14 +735,12 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (RD->isUnion() && ILE->getInitializedFieldInUnion() && ILE->getInitializedFieldInUnion()->getType()->isReferenceType()) visitLocalsRetainedByReferenceBinding(Path, ILE->getInit(0), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); else { unsigned Index = 0; for (; Index < RD->getNumBases() && Index < ILE->getNumInits(); ++Index) visitLocalsRetainedByInitializer(Path, ILE->getInit(Index), Visit, - RevisitSubinits, - EnableLifetimeWarnings); + RevisitSubinits); for (const auto *I : RD->fields()) { if (Index >= ILE->getNumInits()) break; @@ -772,14 +749,13 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *SubInit = ILE->getInit(Index); if (I->getType()->isReferenceType()) visitLocalsRetainedByReferenceBinding(Path, SubInit, - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); else // This might be either aggregate-initialization of a member or // initialization of a std::initializer_list object. Regardless, // we should recursively lifetime-extend that initializer. 
- visitLocalsRetainedByInitializer( - Path, SubInit, Visit, RevisitSubinits, EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, SubInit, Visit, + RevisitSubinits); ++Index; } } @@ -800,10 +776,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Path.push_back({IndirectLocalPathEntry::LambdaCaptureInit, E, &Cap}); if (E->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, E, RK_ReferenceBinding, - Visit, EnableLifetimeWarnings); + Visit); else - visitLocalsRetainedByInitializer(Path, E, Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, E, Visit, true); if (Cap.capturesVariable()) Path.pop_back(); } @@ -817,16 +792,14 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Arg = MTE->getSubExpr(); Path.push_back({IndirectLocalPathEntry::TemporaryCopy, Arg, CCE->getConstructor()}); - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings*/ false); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true); Path.pop_back(); } } } if (isa(Init) || isa(Init)) - return visitFunctionCallArguments(Path, Init, Visit, - EnableLifetimeWarnings); + return visitFunctionCallArguments(Path, Init, Visit); switch (Init->getStmtClass()) { case Stmt::UnaryOperatorClass: { @@ -842,8 +815,7 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Path.push_back({IndirectLocalPathEntry::AddressOf, UO}); visitLocalsRetainedByReferenceBinding(Path, UO->getSubExpr(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); } break; } @@ -856,11 +828,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, break; if (BO->getLHS()->getType()->isPointerType()) - visitLocalsRetainedByInitializer(Path, BO->getLHS(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, BO->getLHS(), Visit, true); else if (BO->getRHS()->getType()->isPointerType()) - visitLocalsRetainedByInitializer(Path, BO->getRHS(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, BO->getRHS(), Visit, true); break; } @@ -870,11 +840,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // In C++, we can have a throw-expression operand, which has 'void' type // and isn't interesting from a lifetime perspective. 
if (!C->getTrueExpr()->getType()->isVoidType()) - visitLocalsRetainedByInitializer(Path, C->getTrueExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, C->getTrueExpr(), Visit, true); if (!C->getFalseExpr()->getType()->isVoidType()) - visitLocalsRetainedByInitializer(Path, C->getFalseExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, C->getFalseExpr(), Visit, true); break; } @@ -992,8 +960,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity, const InitializedEntity *ExtendingEntity, LifetimeKind LK, - const AssignedEntity *AEntity, Expr *Init, - bool EnableLifetimeWarnings) { + const AssignedEntity *AEntity, Expr *Init) { assert((AEntity && LK == LK_Assignment) || (InitEntity && LK != LK_Assignment)); // If this entity doesn't have an interesting lifetime, don't bother looking @@ -1292,13 +1259,12 @@ static void checkExprLifetimeImpl(Sema &SemaRef, if (Init->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Init, RK_ReferenceBinding, - TemporaryVisitor, - EnableLifetimeWarnings); + TemporaryVisitor); else visitLocalsRetainedByInitializer( Path, Init, TemporaryVisitor, // Don't revisit the sub inits for the intialization case. - /*RevisitSubinits=*/!InitEntity, EnableLifetimeWarnings); + /*RevisitSubinits=*/!InitEntity); } void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, @@ -1306,16 +1272,12 @@ void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, auto LTResult = getEntityLifetime(&Entity); LifetimeKind LK = LTResult.getInt(); const InitializedEntity *ExtendingEntity = LTResult.getPointer(); - bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored( - diag::warn_dangling_lifetime_pointer, SourceLocation()); checkExprLifetimeImpl(SemaRef, &Entity, ExtendingEntity, LK, - /*AEntity*/ nullptr, Init, EnableLifetimeWarnings); + /*AEntity*/ nullptr, Init); } void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, Expr *Init) { - bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored( - diag::warn_dangling_lifetime_pointer, SourceLocation()); bool EnableDanglingPointerAssignment = !SemaRef.getDiagnostics().isIgnored( diag::warn_dangling_pointer_assignment, SourceLocation()); bool RunAnalysis = (EnableDanglingPointerAssignment && @@ -1327,7 +1289,7 @@ void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, checkExprLifetimeImpl(SemaRef, /*InitEntity=*/nullptr, /*ExtendingEntity=*/nullptr, LK_Assignment, &Entity, - Init, EnableLifetimeWarnings); + Init); } } // namespace clang::sema diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index cd1904db32710..67d1ceaa02d03 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -498,4 +498,30 @@ std::string_view test2(int i, std::optional a) { return std::move(*a); return std::move(a.value()); } + +struct Foo; +struct FooView { + FooView(const Foo& foo [[clang::lifetimebound]]); +}; +FooView test3(int i, std::optional a) { + if (i) + return *a; // expected-warning {{address of stack memory}} + return a.value(); // expected-warning {{address of stack memory}} +} +} // namespace GH93386 + +namespace GH100549 { +struct UrlAnalyzed { + UrlAnalyzed(std::string_view url [[clang::lifetimebound]]); +}; +std::string StrCat(std::string_view, std::string_view); +void test1() { + UrlAnalyzed url(StrCat("abc", "bcd")); // expected-warning 
{{object backing the pointer will be destroyed}}
+}
+
+std::string_view ReturnStringView(std::string_view abc [[clang::lifetimebound]]);
+
+void test() {
+ std::string_view svjkk1 = ReturnStringView(StrCat("bar", "x")); // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}}
}
+} // namespace GH100549

From af1e59aea2ea7d07ece1f34621dda38c995826a3 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 3 Sep 2024 15:35:40 -0700
Subject: [PATCH 084/425] [SLP]Fix PR107037: correctly track original/modified-after-vectorization reduced values

Need to correctly track reduced values with multiple uses in the same reduction
emission attempt. Otherwise, the number of reuses might be calculated
incorrectly, which may cause a compiler crash.

Fixes https://github.com/llvm/llvm-project/issues/107037
---
 .../Transforms/Vectorize/SLPVectorizer.cpp | 69 ++++++++++---------
 .../X86/multi-tracked-reduced-value.ll | 47 +++++++++++++
 2 files changed, 85 insertions(+), 31 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a7272deb3c34f..19b95cf473e91 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -17721,6 +17721,12 @@ class HorizontalReduction {
 for (Value *V : Candidates)
 TrackedVals.try_emplace(V, V);

+ auto At = [](MapVector &MV, Value *V) -> unsigned & {
+ auto *It = MV.find(V);
+ assert(It != MV.end() && "Unable to find given key.");
+ return It->second;
+ };
+
 DenseMap VectorizedVals(ReducedVals.size());
 // List of the values that were reduced in other trees as part of gather
 // nodes and thus requiring extract if fully vectorized in other trees.
@@ -17738,7 +17744,7 @@ class HorizontalReduction {
 Candidates.reserve(2 * OrigReducedVals.size());
 DenseMap TrackedToOrig(2 * OrigReducedVals.size());
 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
- Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
+ Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
 // Check if the reduction value was not overriden by the extractelement
 // instruction because of the vectorization and exclude it, if it is not
 // compatible with other values.
@@ -17757,7 +17763,7 @@ class HorizontalReduction {
 I + 1 < E) {
 SmallVector CommonCandidates(Candidates);
 for (Value *RV : ReducedVals[I + 1]) {
- Value *RdxVal = TrackedVals.find(RV)->second;
+ Value *RdxVal = TrackedVals.at(RV);
 // Check if the reduction value was not overriden by the
 // extractelement instruction because of the vectorization and
 // exclude it, if it is not compatible with other values.
@@ -17778,10 +17784,12 @@ class HorizontalReduction {
 // Emit code for constant values.
if (Candidates.size() > 1 && allConstant(Candidates)) { Value *Res = Candidates.front(); - ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond(); + Value *OrigV = TrackedToOrig.at(Candidates.front()); + ++VectorizedVals.try_emplace(OrigV).first->getSecond(); for (Value *VC : ArrayRef(Candidates).drop_front()) { Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps); - ++VectorizedVals.try_emplace(VC, 0).first->getSecond(); + Value *OrigV = TrackedToOrig.at(VC); + ++VectorizedVals.try_emplace(OrigV).first->getSecond(); if (auto *ResI = dyn_cast(Res)) V.analyzedReductionRoot(ResI); } @@ -17802,8 +17810,10 @@ class HorizontalReduction { // Gather same values. MapVector SameValuesCounter; if (IsSupportedHorRdxIdentityOp) - for (Value *V : Candidates) - ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second; + for (Value *V : Candidates) { + Value *OrigV = TrackedToOrig.at(V); + ++SameValuesCounter.try_emplace(OrigV).first->second; + } // Used to check if the reduced values used same number of times. In this // case the compiler may produce better code. E.g. if reduced values are // aabbccdd (8 x values), then the first node of the tree will have a node @@ -17827,12 +17837,12 @@ class HorizontalReduction { }); Candidates.resize(SameValuesCounter.size()); transform(SameValuesCounter, Candidates.begin(), - [](const auto &P) { return P.first; }); + [&](const auto &P) { return TrackedVals.at(P.first); }); NumReducedVals = Candidates.size(); // Have a reduction of the same element. if (NumReducedVals == 1) { - Value *OrigV = TrackedToOrig.find(Candidates.front())->second; - unsigned Cnt = SameValuesCounter.lookup(OrigV); + Value *OrigV = TrackedToOrig.at(Candidates.front()); + unsigned Cnt = At(SameValuesCounter, OrigV); Value *RedVal = emitScaleForReusedOps(Candidates.front(), Builder, Cnt); VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); @@ -17936,8 +17946,8 @@ class HorizontalReduction { if (Cnt >= Pos && Cnt < Pos + ReduxWidth) continue; Value *V = Candidates[Cnt]; - Value *OrigV = TrackedToOrig.find(V)->second; - ++SameValuesCounter[OrigV]; + Value *OrigV = TrackedToOrig.at(V); + ++SameValuesCounter.try_emplace(OrigV).first->second; } } SmallPtrSet VLScalars(VL.begin(), VL.end()); @@ -17955,10 +17965,10 @@ class HorizontalReduction { LocalExternallyUsedValues[RdxVal]; continue; } - Value *OrigV = TrackedToOrig.find(RdxVal)->second; + Value *OrigV = TrackedToOrig.at(RdxVal); unsigned NumOps = - VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV]; - if (NumOps != ReducedValsToOps.find(OrigV)->second.size()) + VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV); + if (NumOps != ReducedValsToOps.at(OrigV).size()) LocalExternallyUsedValues[RdxVal]; } // Do not need the list of reused scalars in regular mode anymore. @@ -17983,9 +17993,8 @@ class HorizontalReduction { break; if (Cost >= -SLPCostThreshold) { V.getORE()->emit([&]() { - return OptimizationRemarkMissed( - SV_NAME, "HorSLPNotBeneficial", - ReducedValsToOps.find(VL[0])->second.front()) + return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", + ReducedValsToOps.at(VL[0]).front()) << "Vectorizing horizontal reduction is possible " << "but not beneficial with cost " << ore::NV("Cost", Cost) << " and threshold " @@ -17999,9 +18008,8 @@ class HorizontalReduction { LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". 
(HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemark( - SV_NAME, "VectorizedHorizontalReduction", - ReducedValsToOps.find(VL[0])->second.front()) + return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", + ReducedValsToOps.at(VL[0]).front()) << "Vectorized horizontal reduction with cost " << ore::NV("Cost", Cost) << " and with tree size " << ore::NV("TreeSize", V.getTreeSize()); @@ -18083,12 +18091,12 @@ class HorizontalReduction { VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree); // Count vectorized reduced values to exclude them from final reduction. for (Value *RdxVal : VL) { - Value *OrigV = TrackedToOrig.find(RdxVal)->second; + Value *OrigV = TrackedToOrig.at(RdxVal); if (IsSupportedHorRdxIdentityOp) { - VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]); + VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV)); continue; } - ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond(); + ++VectorizedVals.try_emplace(OrigV).first->getSecond(); if (!V.isVectorized(RdxVal)) RequiredExtract.insert(RdxVal); } @@ -18099,10 +18107,10 @@ class HorizontalReduction { } if (OptReusedScalars && !AnyVectorized) { for (const std::pair &P : SameValuesCounter) { - Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second); + Value *RdxVal = TrackedVals.at(P.first); + Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second); VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); - Value *OrigV = TrackedToOrig.find(P.first)->second; - VectorizedVals.try_emplace(OrigV, P.second); + VectorizedVals.try_emplace(P.first, P.second); } continue; } @@ -18190,8 +18198,7 @@ class HorizontalReduction { continue; unsigned NumOps = VectorizedVals.lookup(RdxVal); for (Instruction *RedOp : - ArrayRef(ReducedValsToOps.find(RdxVal)->second) - .drop_back(NumOps)) + ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps)) ExtraReductions.emplace_back(RedOp, RdxVal); } } @@ -18430,7 +18437,7 @@ class HorizontalReduction { // root = mul prev_root, <1, 1, n, 1> SmallVector Vals; for (Value *V : VL) { - unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V)); Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false)); } auto *Scale = ConstantVector::get(Vals); @@ -18468,7 +18475,7 @@ class HorizontalReduction { bool NeedShuffle = false; for (unsigned I = 0, VF = VL.size(); I < VF; ++I) { Value *V = VL[I]; - unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V)); if (Cnt % 2 == 0) { Mask[I] = VF; NeedShuffle = true; @@ -18488,7 +18495,7 @@ class HorizontalReduction { // root = fmul prev_root, <1.0, 1.0, n.0, 1.0> SmallVector Vals; for (Value *V : VL) { - unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V)); Vals.push_back(ConstantFP::get(V->getType(), Cnt)); } auto *Scale = ConstantVector::get(Vals); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll new file mode 100644 index 0000000000000..e012cc60960b3 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | 
FileCheck %s + +define i8 @test() { +; CHECK-LABEL: define i8 @test() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 0 to i8 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 0 to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 0 to i8 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 0 to i8 +; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i8 [[TMP4]], [[TMP0]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = or i8 [[OP_RDX]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = or i8 [[OP_RDX1]], [[TMP0]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = or i8 [[OP_RDX2]], [[TMP1]] +; CHECK-NEXT: [[OP_RDX4:%.*]] = or i8 [[OP_RDX3]], [[TMP3]] +; CHECK-NEXT: ret i8 [[OP_RDX4]] +; +entry: + %0 = trunc i32 0 to i8 + %1 = add i8 %0, 0 + %2 = add i8 %0, 0 + %3 = add i8 %0, 0 + %4 = add i8 %0, 0 + %5 = trunc i32 0 to i8 + %6 = or i8 %5, %0 + %7 = or i8 %6, %2 + %8 = or i8 %7, %3 + %9 = or i8 %8, %0 + %10 = or i8 %9, %4 + %conv4 = or i8 %10, %1 + %11 = trunc i32 0 to i8 + %12 = add i8 %11, 0 + %conv7 = or i8 %conv4, %12 + %13 = add i8 %11, 0 + %14 = add i8 %11, 0 + %15 = add i8 %11, 0 + %16 = trunc i32 0 to i8 + %17 = or i8 %13, %16 + %18 = or i8 %17, %14 + %19 = or i8 %18, %11 + %20 = or i8 %19, %15 + %conv5 = or i8 %20, %conv7 + %21 = trunc i32 0 to i8 + %conv6 = or i8 %21, %conv5 + ret i8 %conv6 +} From bb1b368e0ad3da98b4c51018bdbcd6a83d8e646d Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Wed, 4 Sep 2024 13:07:57 +0100 Subject: [PATCH 085/425] [AArch64] Implement intrinsics for SVE FAMIN/FAMAX (#99042) This patch implements the following intrinsics: * Floating-point absolute maximum (predicated) svfloat16_t svamax[_f16]_m(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svamax[_f16]_x(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svamax[_f16]_z(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svamax[_n_f16]_m(svbool_t, svfloat16_t, float16_t); svfloat16_t svamax[_n_f16]_x(svbool_t, svfloat16_t, float16_t); svfloat16_t svamax[_n_f16]_z(svbool_t, svfloat16_t, float16_t); * Floating-point absolute minimum (predicated) svfloat16_t svmin[_f16]_m(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svmin[_f16]_x(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svmin[_f16]_z(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svmin[_n_f16]_m(svbool_t, svfloat16_t, float16_t); svfloat16_t svmin[_n_f16]_x(svbool_t, svfloat16_t, float16_t); svfloat16_t svmin[_n_f16]_z(svbool_t, svfloat16_t, float16_t); All the intrinsics have also variants for `f32` and `f64`, and have the `__arm_streaming` attribute. (cf. 
https://github.com/ARM-software/acle/pull/324) --- clang/include/clang/Basic/arm_sve.td | 7 +- .../acle_sve2_faminmax.c | 775 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 7 + .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 8 +- .../AArch64/sve2-intrinsics-faminmax.ll | 115 +++ 5 files changed, 908 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_faminmax.c create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-faminmax.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 81527d8b98760..a0f12e1bbd2d4 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2418,4 +2418,9 @@ let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; -} \ No newline at end of file +} + +let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { + defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; + defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; +} diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_faminmax.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_faminmax.c new file mode 100644 index 0000000000000..3cf7d99d606f3 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_faminmax.c @@ -0,0 +1,775 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s + +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CPP +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CPP +// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CPP + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +faminmax -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#ifdef __ARM_FEATURE_SME +#include "arm_sme.h" +#else +#include "arm_sve.h" +#endif + +#ifdef SVE_OVERLOADED_FORMS +#define 
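For reference, a sketch of how the new predicated absolute-minimum/maximum intrinsics are meant to be called from user code (illustrative only, not part of the patch: it assumes a compiler carrying this change with the sve2 and faminmax features enabled, and the function and variable names are invented):

#include <arm_sve.h>

svfloat32_t reduce_magnitude(svbool_t pg, svfloat32_t a, svfloat32_t b, float32_t limit) {
  svfloat32_t m = svamax_f32_m(pg, a, b);        // merging form: inactive lanes keep a
  svfloat32_t x = svamax_f32_x(pg, a, b);        // "don't care" form for inactive lanes
  svfloat32_t z = svamin_n_f32_z(pg, m, limit);  // zeroing form, scalar second operand
  return svamin_f32_m(pg, x, z);
}

The _m/_x/_z suffixes follow the usual ACLE predication conventions, which is also what the CHECK lines below verify: a select feeding the intrinsic for the _z forms, and the .u intrinsic for the _x forms.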
SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +#ifdef __ARM_FEATURE_SME +#define STREAMING __arm_streaming +#else +#define STREAMING +#endif + +// CHECK-LABEL: define dso_local @test_famin_f16_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f16_mu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famin_f16_m(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f16, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f16_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f16_xu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famin_f16_x(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f16, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f16_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f16_zu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat16_t test_famin_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f16, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f16_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f16_mu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famin_n_f16_m(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f16, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f16_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f16_xu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famin_n_f16_x(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f16, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f16_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f16_zu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// 
CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat16_t test_famin_n_f16_z(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f16, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f32_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f32_mu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famin_f32_m(svbool_t pg, svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f32, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f32_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f32_xu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famin_f32_x(svbool_t pg, svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f32, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f32_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f32_zu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat32_t test_famin_f32_z(svbool_t pg, 
svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f32, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f32_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f32_mu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famin_n_f32_m(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f32, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f32_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f32_xu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famin_n_f32_x(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f32, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f32_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// 
CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f32_zu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat32_t test_famin_n_f32_z(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f32, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f64_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f64_mu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famin_f64_m(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f64, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f64_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f64_xu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famin_f64_x(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f64, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f64_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f64_zu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat64_t test_famin_f64_z(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f64, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f64_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f64_mu10__SVBool_tu13__SVFloat64_td( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famin_n_f64_m(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f64, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f64_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f64_xu10__SVBool_tu13__SVFloat64_td( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famin_n_f64_x(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f64, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f64_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = 
insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f64_zu10__SVBool_tu13__SVFloat64_td( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat64_t test_famin_n_f64_z(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f64, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f16_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f16_mu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famax_f16_m(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f16, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f16_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f16_xu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famax_f16_x(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f16, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f16_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f16_zu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat16_t test_famax_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f16, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f16_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f16_mu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famax_n_f16_m(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f16, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f16_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f16_xu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famax_n_f16_x(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f16, _x)(pg, a, 
b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f16_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f16_zu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat16_t test_famax_n_f16_z(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f16, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f32_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f32_mu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famax_f32_m(svbool_t pg, svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f32, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f32_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f32_xu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famax_f32_x(svbool_t pg, svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f32, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local 
@test_famax_f32_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f32_zu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat32_t test_famax_f32_z(svbool_t pg, svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f32, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f32_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f32_mu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famax_n_f32_m(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f32, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f32_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f32_xu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector 
[[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famax_n_f32_x(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f32, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f32_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f32_zu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat32_t test_famax_n_f32_z(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f32, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f64_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f64_mu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famax_f64_m(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f64, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f64_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f64_xu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famax_f64_x(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f64, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f64_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f64_zu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat64_t test_famax_f64_z(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f64, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f64_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f64_mu10__SVBool_tu13__SVFloat64_td( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famax_n_f64_m(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f64, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f64_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f64_xu10__SVBool_tu13__SVFloat64_td( +// 
CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famax_n_f64_x(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f64, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f64_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f64_zu10__SVBool_tu13__SVFloat64_td( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat64_t test_famax_n_f64_z(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f64, _z)(pg, a, b); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6727ee69d7b3e..9bce850750f79 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3785,3 +3785,10 @@ def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic def int_aarch64_sme_mopa_nonwide : SME_OuterProduct_Intrinsic; def int_aarch64_sme_mops_nonwide : SME_OuterProduct_Intrinsic; +// SVE2/SME2 - Floating point absolute maximum and minimum + +def int_aarch64_sve_famax : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_famax_u : AdvSIMD_Pred2VectorArg_Intrinsic; + +def int_aarch64_sve_famin : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_famin_u : AdvSIMD_Pred2VectorArg_Intrinsic; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index af8ddb49b0ac6..4922fb280333b 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4183,9 +4183,11 @@ defm FCVTNT_Z2Z_StoB : sve2_fp8_down_cvt_single<0b11, "fcvtnt", ZZ_s_mul_r>; } // End HasSVE2orSME2, HasFP8 let Predicates = [HasSVE2orSME2, HasFAMINMAX] in { -// FP8 Arithmetic - Predicated Group -defm FAMIN_ZPmZ : 
sve_fp_2op_p_zds<0b1111, "famin", "", null_frag, DestructiveOther>; -defm FAMAX_ZPmZ : sve_fp_2op_p_zds<0b1110, "famax", "", null_frag, DestructiveOther>; +defm FAMIN_ZPmZ : sve_fp_2op_p_zds<0b1111, "famin", "FAMIN_ZPZZ", int_aarch64_sve_famin, DestructiveBinaryComm>; +defm FAMAX_ZPmZ : sve_fp_2op_p_zds<0b1110, "famax", "FAMAX_ZPZZ", int_aarch64_sve_famax, DestructiveBinaryComm>; + +defm FAMAX_ZPZZ : sve_fp_bin_pred_hfd; +defm FAMIN_ZPZZ : sve_fp_bin_pred_hfd; } // End HasSVE2orSME2, HasFAMINMAX let Predicates = [HasSSVE_FP8FMA] in { diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-faminmax.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-faminmax.ll new file mode 100644 index 0000000000000..7d16f8383d968 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-faminmax.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mattr=+sve2 < %s | FileCheck %s +; RUN: llc -mattr=+sme2 -force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define @famin_f16( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.nxv8f16( %pg, %a, %b) + ret %r +} + +define @famin_f32( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.nxv4f32( %pg, %a, %b) + ret %r +} + +define @famin_f64( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.nxv2f64( %pg, %a, %b) + ret %r +} + +define @famin_u_f16( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_u_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.u.nxv8f16( %pg, %b, %a) + ret %r +} + +define @famin_u_f32( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_u_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.u.nxv4f32( %pg, %b, %a) + ret %r +} + +define @famin_u_f64( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_u_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.u.nxv2f64( %pg, %b, %a) + ret %r +} + +define @famax_f16( %pg, %a, %b) #0 { +; CHECK-LABEL: famax_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.nxv8f16( %pg, %a, %b) + ret %r +} + +define @famax_f32( %pg, %a, %b) #0 { +; CHECK-LABEL: famax_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.nxv4f32( %pg, %a, %b) + ret %r +} + +define @famax_f64( %pg, %a, %b) #0 { +; CHECK-LABEL: famax_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.nxv2f64( %pg, %a, %b) + ret %r +} + +define @famax_u_f16( %pg, %a, %b) #0 { +; CHECK-LABEL: famax_u_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.u.nxv8f16( %pg, %b, %a) + ret %r +} + +define @famax_u_f32( %pg, %a, %b) #0 { +; CHECK-LABEL: famax_u_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.u.nxv4f32( %pg, %b, %a) + ret %r +} + +define @famax_u_f64( %pg, %a, %b) #0 { +; 
CHECK-LABEL: famax_u_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.u.nxv2f64( %pg, %b, %a) + ret %r +} + +attributes #0 = { nounwind "target-features" = "+faminmax" } From c2b92a4250b3f514685676ba8985ea73450f14d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Chmiel?= Date: Wed, 4 Sep 2024 14:10:51 +0200 Subject: [PATCH 086/425] [SROA] Use SetVector for PromotableAllocas (#105809) Use SetVector to make operations more efficient if there is a very large number of allocas. --- llvm/lib/Transforms/Scalar/SROA.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 2310cb3a7decb..d0186da1bc5e2 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -198,7 +198,9 @@ class SROA { SmallSetVector PostPromotionWorklist; /// A collection of alloca instructions we can directly promote. - std::vector PromotableAllocas; + SetVector, + SmallPtrSet, 16> + PromotableAllocas; /// A worklist of PHIs to speculate prior to promoting allocas. /// @@ -4799,9 +4801,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // Finally, don't try to promote any allocas that new require re-splitting. // They have already been added to the worklist above. - llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) { - return ResplitPromotableAllocas.count(AI); - }); + PromotableAllocas.set_subtract(ResplitPromotableAllocas); return true; } @@ -4963,7 +4963,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, } if (PHIUsers.empty() && SelectUsers.empty()) { // Promote the alloca. - PromotableAllocas.push_back(NewAI); + PromotableAllocas.insert(NewAI); } else { // If we have either PHIs or Selects to speculate, add them to those // worklists and re-queue the new alloca so that we promote in on the @@ -5598,7 +5598,7 @@ bool SROA::promoteAllocas(Function &F) { LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n"); } else { LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, DTU->getDomTree(), AC); + PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC); } PromotableAllocas.clear(); @@ -5615,7 +5615,7 @@ std::pair SROA::runSROA(Function &F) { if (AllocaInst *AI = dyn_cast(I)) { if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() && isAllocaPromotable(AI)) - PromotableAllocas.push_back(AI); + PromotableAllocas.insert(AI); else Worklist.insert(AI); } @@ -5639,10 +5639,9 @@ std::pair SROA::runSROA(Function &F) { // Remove the deleted allocas from various lists so that we don't try to // continue processing them. 
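The container change can be pictured with a small standalone sketch (illustrative only, not part of the patch; the names are invented): a SetVector keeps the deterministic insertion order that PromoteMemToReg is handed, while also providing duplicate-free insertion and the set-style bulk removal used by the set_subtract calls in this hunk.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"

static void sketch(llvm::SetVector<int *> &Promotable, int *NewCandidate,
                   const llvm::SmallPtrSet<int *, 16> &Deleted) {
  Promotable.insert(NewCandidate);   // no-op if the candidate is already tracked
  Promotable.set_subtract(Deleted);  // drop every deleted candidate in one call
  llvm::ArrayRef<int *> InOrder = Promotable.getArrayRef(); // contiguous view, stable order
  (void)InOrder;
}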
if (!DeletedAllocas.empty()) { - auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); }; - Worklist.remove_if(IsInSet); - PostPromotionWorklist.remove_if(IsInSet); - llvm::erase_if(PromotableAllocas, IsInSet); + Worklist.set_subtract(DeletedAllocas); + PostPromotionWorklist.set_subtract(DeletedAllocas); + PromotableAllocas.set_subtract(DeletedAllocas); DeletedAllocas.clear(); } } From d65ff3e9364536f9e0bd5f1c1bace626c256a2ad Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 4 Sep 2024 05:12:17 -0700 Subject: [PATCH 087/425] [SLP]Fix PR107198: add a check for empty complex type Need to check if the complex type is empty before trying to dig in, trying to find vectorizable type Fixes https://github.com/llvm/llvm-project/issues/107198 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 ++ llvm/test/Transforms/SLPVectorizer/empty-struct.ll | 14 ++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/empty-struct.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 19b95cf473e91..a2af7f4e1b01c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7780,6 +7780,8 @@ unsigned BoUpSLP::canMapToVector(Type *T) const { Type *EltTy = T; while (isa(EltTy)) { + if (EltTy->isEmptyTy()) + return 0; if (auto *ST = dyn_cast(EltTy)) { // Check that struct is homogeneous. for (const auto *Ty : ST->elements()) diff --git a/llvm/test/Transforms/SLPVectorizer/empty-struct.ll b/llvm/test/Transforms/SLPVectorizer/empty-struct.ll new file mode 100644 index 0000000000000..422d6b2eb3276 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/empty-struct.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s + +define { { {}, {}, {}, {}, {}, {}, {} } } @test({ {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, { {}, {}, {}, {}, {}, {}, {} }, { {} } } %0) { +; CHECK-LABEL: define { { {}, {}, {}, {}, {}, {}, {} } } @test( +; CHECK-SAME: { {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, { {}, {}, {}, {}, {}, {}, {} }, { {} } } [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, { {}, {}, {}, {}, {}, {}, {} }, { {} } } [[TMP0]], 18 +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { { {}, {}, {}, {}, {}, {}, {} } } undef, { {}, {}, {}, {}, {}, {}, {} } [[TMP2]], 0 +; CHECK-NEXT: ret { { {}, {}, {}, {}, {}, {}, {} } } [[TMP3]] +; + %2 = extractvalue { {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, { {}, {}, {}, {}, {}, {}, {} }, { {} } } %0, 18 + %3 = insertvalue { { {}, {}, {}, {}, {}, {}, {} } } undef, { {}, {}, {}, {}, {}, {}, {} } %2, 0 + ret { { {}, {}, {}, {}, {}, {}, {} } } %3 +} From 26645ae2eea00456d98b497f348426c375409ce4 Mon Sep 17 00:00:00 2001 From: Menooker Date: Wed, 4 Sep 2024 20:36:19 +0800 Subject: [PATCH 088/425] [mlir][memref] Fix hoist-static-allocs option of buffer-results-to-out-params when function parameters are returned (#102093) buffer-results-to-out-params pass will have a nullptr-referencing error when hoist-static-allocs option is on, when the return value of a function is a parameter of the function. This PR fixes this issue. 
--- .../Transforms/BufferResultsToOutParams.cpp | 3 ++- .../buffer-results-to-out-params-elim.mlir | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp index b19636adaa69e..b7755b2be8483 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp @@ -120,7 +120,8 @@ static LogicalResult updateReturnOps(func::FuncOp func, } OpBuilder builder(op); for (auto [orig, arg] : llvm::zip(copyIntoOutParams, appendedEntryArgs)) { - if (hoistStaticAllocs && isa(orig.getDefiningOp()) && + if (hoistStaticAllocs && + isa_and_nonnull(orig.getDefiningOp()) && mlir::cast(orig.getType()).hasStaticShape()) { orig.replaceAllUsesWith(arg); orig.getDefiningOp()->erase(); diff --git a/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir b/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir index f77dbfaa6cb11..2783836a09e16 100644 --- a/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir +++ b/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir @@ -34,4 +34,18 @@ func.func @basic_dynamic(%d: index) -> (memref) { %b = memref.alloc(%d) : memref "test.source"(%b) : (memref) -> () return %b : memref -} \ No newline at end of file +} + +// ----- + +// no change due to writing to func args +// CHECK-LABEL: func @return_arg( +// CHECK-SAME: %[[ARG0:.*]]: memref<128x256xf32>, %[[ARG1:.*]]: memref<128x256xf32>, %[[ARG2:.*]]: memref<128x256xf32>) { +// CHECK: "test.source"(%[[ARG0]], %[[ARG1]]) +// CHECK: memref.copy +// CHECK: return +// CHECK: } +func.func @return_arg(%arg0: memref<128x256xf32>, %arg1: memref<128x256xf32>) -> memref<128x256xf32> { + "test.source"(%arg0, %arg1) : (memref<128x256xf32>, memref<128x256xf32>) -> () + return %arg0 : memref<128x256xf32> +} From 6238159e886baca5ebf31fd6b15d79db30ced889 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 4 Sep 2024 20:46:21 +0800 Subject: [PATCH 089/425] [CodeGen][NewPM] Fix linker error due to dummy `MachineVerifierPass` (#107237) Forgot to remove the dummy registry in `MachinePassRegistry.def`. 
--- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1 + llvm/include/llvm/Passes/MachinePassRegistry.def | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 64d20ae5b20ef..602e64f23ed51 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -45,6 +45,7 @@ #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/PHIElimination.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/RegAllocFast.h" diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index b710b1c46f643..0423e20f58d30 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -228,7 +228,6 @@ DUMMY_MACHINE_FUNCTION_PASS("machine-sink", MachineSinkingPass) DUMMY_MACHINE_FUNCTION_PASS("machine-uniformity", MachineUniformityInfoWrapperPass) DUMMY_MACHINE_FUNCTION_PASS("machineinstr-printer", MachineFunctionPrinterPass) DUMMY_MACHINE_FUNCTION_PASS("machinelicm", MachineLICMPass) -DUMMY_MACHINE_FUNCTION_PASS("machineverifier", MachineVerifierPass) DUMMY_MACHINE_FUNCTION_PASS("mirfs-discriminators", MIRAddFSDiscriminatorsPass) DUMMY_MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass) DUMMY_MACHINE_FUNCTION_PASS("patchable-function", PatchableFunctionPass) From df5840f9f09280a33923f119db5a82e0cda3622d Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 4 Sep 2024 15:58:17 +0300 Subject: [PATCH 090/425] [AMDGPU][Docs] Update product names for some targets (#106973) Based on https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-gpus. --- llvm/docs/AMDGPUUsage.rst | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 80140734cbefd..a5ad3f6bbf7b2 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -392,12 +392,12 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following work-item Add product IDs names. - ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected *TBA* - - tgsplit flat - - xnack scratch .. TODO:: + ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected - AMD Instinct MI300X + - tgsplit flat - AMD Instinct MI300A + - xnack scratch - kernarg preload - Packed - work-item Add product - IDs names. 
+ work-item + IDs **GCN GFX10.1 (RDNA 1)** [AMD-GCN-GFX10-RDNA1]_ ----------------------------------------------------------------------------------------------------------------------- @@ -424,6 +424,8 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following ``gfx1030`` ``amdgcn`` dGPU - cumode - Absolute - *rocm-amdhsa* - Radeon RX 6800 - wavefrontsize64 flat - *pal-amdhsa* - Radeon RX 6800 XT scratch - *pal-amdpal* - Radeon RX 6900 XT + - Radeon PRO W6800 + - Radeon PRO V620 ``gfx1031`` ``amdgcn`` dGPU - cumode - Absolute - *rocm-amdhsa* - Radeon RX 6700 XT - wavefrontsize64 flat - *pal-amdhsa* scratch - *pal-amdpal* @@ -462,12 +464,12 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following **GCN GFX11 (RDNA 3)** [AMD-GCN-GFX11-RDNA3]_ ----------------------------------------------------------------------------------------------------------------------- - ``gfx1100`` ``amdgcn`` dGPU - cumode - Architected - *pal-amdpal* *TBA* - - wavefrontsize64 flat - scratch .. TODO:: - - Packed - work-item Add product - IDs names. + ``gfx1100`` ``amdgcn`` dGPU - cumode - Architected - *pal-amdpal* - Radeon PRO W7900 Dual Slot + - wavefrontsize64 flat - Radeon PRO W7900 + scratch - Radeon PRO W7800 + - Packed - Radeon RX 7900 XTX + work-item - Radeon RX 7900 XT + IDs - Radeon RX 7900 GRE ``gfx1101`` ``amdgcn`` dGPU - cumode - Architected *TBA* - wavefrontsize64 flat From 205f7ee737f75e666f70ad51bda5f778c02ab124 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 15:01:16 +0200 Subject: [PATCH 091/425] [Lint] Skip null args when checking noalias Do not emit a warning if there are two null noalias arguments, as they cannot be dereferenced anyway. This is a common pattern for `@.omp_outlined`, which has some optional noalias arguments. --- llvm/lib/Analysis/Lint.cpp | 3 ++- llvm/test/Analysis/Lint/noalias-null.ll | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Analysis/Lint/noalias-null.ll diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 00e430ce8e0ab..e0a029802bbd9 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -244,7 +244,8 @@ void Lint::visitCallBase(CallBase &I) { // dereferenced anyway. if (I.doesNotAccessMemory(ArgNo)) continue; - if (AI != BI && (*BI)->getType()->isPointerTy()) { + if (AI != BI && (*BI)->getType()->isPointerTy() && + !isa(*BI)) { AliasResult Result = AA->alias(*AI, *BI); Check(Result != AliasResult::MustAlias && Result != AliasResult::PartialAlias, diff --git a/llvm/test/Analysis/Lint/noalias-null.ll b/llvm/test/Analysis/Lint/noalias-null.ll new file mode 100644 index 0000000000000..5d2ecb5da5898 --- /dev/null +++ b/llvm/test/Analysis/Lint/noalias-null.ll @@ -0,0 +1,14 @@ +; RUN: opt < %s -passes=lint -disable-output 2>&1 | FileCheck --allow-empty %s + +declare void @foo(ptr noalias, ptr noalias) + +define void @test() { +entry: + call void @foo(ptr null, ptr null) + ret void +} + +; Lint should not complain about passing null to both arguments if they are +; null, since noalias only applies if the argument is written to, which is not +; possible for a null pointer. +; CHECK-NOT: Unusual: noalias argument aliases another argument From 3d53212f6104c27df5097301587ece69db9c007e Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 4 Sep 2024 15:03:36 +0200 Subject: [PATCH 092/425] [LLD][COFF] Initial support for ARM64EC importlibs. 
(#107164) Use demangled symbol name for __imp_ symbols and define demangled thunk symbol as AMD64 thunk. --- lld/COFF/InputFiles.cpp | 23 +++++++++-- lld/test/COFF/arm64ec-import.test | 68 +++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 lld/test/COFF/arm64ec-import.test diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 50bc62312a6f8..fa2d230075d9d 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -25,6 +25,7 @@ #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" +#include "llvm/IR/Mangler.h" #include "llvm/LTO/LTO.h" #include "llvm/Object/Binary.h" #include "llvm/Object/COFF.h" @@ -1019,9 +1020,17 @@ void ImportFile::parse() { // Read names and create an __imp_ symbol. StringRef buf = mb.getBuffer().substr(sizeof(*hdr)); - StringRef name = saver().save(buf.split('\0').first); + auto split = buf.split('\0'); + buf = split.second; + StringRef name; + if (isArm64EC(hdr->Machine)) { + if (std::optional demangledName = + getArm64ECDemangledFunctionName(split.first)) + name = saver().save(*demangledName); + } + if (name.empty()) + name = saver().save(split.first); StringRef impName = saver().save("__imp_" + name); - buf = buf.substr(name.size() + 1); dllName = buf.split('\0').first; StringRef extName; switch (hdr->getNameType()) { @@ -1058,8 +1067,14 @@ void ImportFile::parse() { // If type is function, we need to create a thunk which jump to an // address pointed by the __imp_ symbol. (This allows you to call // DLL functions just like regular non-DLL functions.) - if (hdr->getType() == llvm::COFF::IMPORT_CODE) - thunkSym = ctx.symtab.addImportThunk(name, impSym, hdr->Machine); + if (hdr->getType() == llvm::COFF::IMPORT_CODE) { + if (ctx.config.machine != ARM64EC) { + thunkSym = ctx.symtab.addImportThunk(name, impSym, hdr->Machine); + } else { + thunkSym = ctx.symtab.addImportThunk(name, impSym, AMD64); + // FIXME: Add aux IAT symbols. 
+ } + } } BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, diff --git a/lld/test/COFF/arm64ec-import.test b/lld/test/COFF/arm64ec-import.test new file mode 100644 index 0000000000000..2a80a30910b5c --- /dev/null +++ b/lld/test/COFF/arm64ec-import.test @@ -0,0 +1,68 @@ +REQUIRES: aarch64, x86 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test.s -o test.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +RUN: llvm-lib -machine:arm64ec -def:test.def -out:test-arm64ec.lib +RUN: llvm-lib -machine:arm64ec -def:test2.def -out:test2-arm64ec.lib +RUN: llvm-lib -machine:x64 -def:test.def -out:test-x86_64.lib + +Link using ARM64EC import library: +RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj \ +RUN: test.obj test-arm64ec.lib test2-arm64ec.lib + +Link using x86_64 import library: +RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll loadconfig-arm64ec.obj \ +RUN: test.obj test-x86_64.lib test2-arm64ec.lib + +RUN: llvm-readobj --coff-imports out.dll | FileCheck --check-prefix=IMPORTS %s +RUN: llvm-readobj --coff-imports out2.dll | FileCheck --check-prefix=IMPORTS %s +IMPORTS: Import { +IMPORTS-NEXT: Name: test.dll +IMPORTS-NEXT: ImportLookupTableRVA: +IMPORTS-NEXT: ImportAddressTableRVA: 0x2258 +IMPORTS-NEXT: Symbol: data (0) +IMPORTS-NEXT: Symbol: func (0) +IMPORTS-NEXT: Symbol: func2 (0) +IMPORTS-NEXT: } +IMPORTS-NEXT: Import { +IMPORTS-NEXT: Name: test2.dll +IMPORTS-NEXT: ImportLookupTableRVA: +IMPORTS-NEXT: ImportAddressTableRVA: 0x2278 +IMPORTS-NEXT: Symbol: t2func (0) +IMPORTS-NEXT: } + +RUN: llvm-objdump -d out.dll | FileCheck --check-prefix=DISASM %s +RUN: llvm-objdump -d out2.dll | FileCheck --check-prefix=DISASM %s + +DISASM: 0000000180001000 <.text>: +DISASM-NEXT: 180001000: ff 25 5a 12 00 00 jmpq *0x125a(%rip) # 0x180002260 + +RUN: llvm-readobj --hex-dump=.test out.dll | FileCheck --check-prefix=TESTSEC %s +RUN: llvm-readobj --hex-dump=.test out2.dll | FileCheck --check-prefix=TESTSEC %s +TESTSEC: 0x180004000 60220000 58220000 68220000 78220000 +TESTSEC-NEXT: 0x180004010 00100000 + +#--- test.s + .section .test, "r" + .globl arm64ec_data_sym + .p2align 2, 0x0 +arm64ec_data_sym: + .rva __imp_func + .rva __imp_data + .rva __imp_func2 + .rva __imp_t2func + .rva func + +#--- test.def +NAME test.dll +EXPORTS + data DATA + func + func2 + unused_func + +#--- test2.def +NAME test2.dll +EXPORTS + t2func From f11915153761e0c2691945add795c891e63c0c4a Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 4 Sep 2024 14:09:04 +0100 Subject: [PATCH 093/425] IVDescriptors: improve readability of a function (NFC) (#106219) Avoid dereferencing operand to llvm::isa. --- llvm/lib/Analysis/IVDescriptors.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 560955dede84f..fdf78b9f8a44e 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -734,13 +734,12 @@ RecurrenceDescriptor::isConditionalRdxPattern(RecurKind Kind, Instruction *I) { Value *FalseVal = SI->getFalseValue(); // Handle only when either of operands of select instruction is a PHI // node for now. - if ((isa(*TrueVal) && isa(*FalseVal)) || - (!isa(*TrueVal) && !isa(*FalseVal))) + if ((isa(TrueVal) && isa(FalseVal)) || + (!isa(TrueVal) && !isa(FalseVal))) return InstDesc(false, I); - Instruction *I1 = - isa(*TrueVal) ? 
dyn_cast(FalseVal) - : dyn_cast(TrueVal); + Instruction *I1 = isa(TrueVal) ? dyn_cast(FalseVal) + : dyn_cast(TrueVal); if (!I1 || !I1->isBinaryOp()) return InstDesc(false, I); @@ -754,8 +753,8 @@ RecurrenceDescriptor::isConditionalRdxPattern(RecurKind Kind, Instruction *I) { (m_Mul(m_Value(Op1), m_Value(Op2)).match(I1)))) return InstDesc(false, I); - Instruction *IPhi = isa(*Op1) ? dyn_cast(Op1) - : dyn_cast(Op2); + Instruction *IPhi = isa(Op1) ? dyn_cast(Op1) + : dyn_cast(Op2); if (!IPhi || IPhi != FalseVal) return InstDesc(false, I); From 4552153c37e04def01e99e32c02eab245d92b753 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Wed, 4 Sep 2024 18:43:09 +0530 Subject: [PATCH 094/425] [CodeGen][MachineCSE] Remove unused AA results(NFC) (#106604) Alias Analysis result is never used in this pass and hence removing it. --- llvm/lib/CodeGen/MachineCSE.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index aadc54b495fe2..2ac1fae9ea48c 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -18,7 +18,6 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -73,7 +72,6 @@ namespace { class MachineCSE : public MachineFunctionPass { const TargetInstrInfo *TII = nullptr; const TargetRegisterInfo *TRI = nullptr; - AliasAnalysis *AA = nullptr; MachineDominatorTree *DT = nullptr; MachineRegisterInfo *MRI = nullptr; MachineBlockFrequencyInfo *MBFI = nullptr; @@ -90,7 +88,6 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); AU.addPreservedID(MachineLoopInfoID); AU.addRequired(); AU.addPreserved(); @@ -167,7 +164,6 @@ char &llvm::MachineCSEID = MachineCSE::ID; INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE, "Machine Common Subexpression Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE, "Machine Common Subexpression Elimination", false, false) @@ -943,7 +939,6 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - AA = &getAnalysis().getAAResults(); DT = &getAnalysis().getDomTree(); MBFI = &getAnalysis().getMBFI(); LookAheadLimit = TII->getMachineCSELookAheadLimit(); From 028174aa2c3a9447aca3333e45b5f89e652b74d1 Mon Sep 17 00:00:00 2001 From: Paul T Robinson Date: Wed, 4 Sep 2024 06:19:41 -0700 Subject: [PATCH 095/425] [DebugInfo] Make a test more robust (#106463) This was accidentally matching a metadata record that happend to have three elements, but wasn't the record of interest. Add CHECKs to make sure we've found the correct record. 
--- clang/test/CodeGenCXX/debug-info-lambda-this.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/test/CodeGenCXX/debug-info-lambda-this.cpp b/clang/test/CodeGenCXX/debug-info-lambda-this.cpp index 0a2f08ea4aa6d..e5acab126d72c 100644 --- a/clang/test/CodeGenCXX/debug-info-lambda-this.cpp +++ b/clang/test/CodeGenCXX/debug-info-lambda-this.cpp @@ -21,7 +21,8 @@ int main() { return 0; } -// CHECK: !{![[FOO_THIS:[0-9]+]], ![[FOO_AA:[0-9]+]], ![[FOO_OPERATOR:[0-9]+]]} +// CHECK: distinct !DICompositeType(tag: DW_TAG_class_type, name: "", {{.*}}, elements: ![[ELEMENT_TAG:[0-9]+]] +// CHECK: ![[ELEMENT_TAG]] = !{![[FOO_THIS:[0-9]+]], ![[FOO_AA:[0-9]+]], ![[FOO_OPERATOR:[0-9]+]]} // CHECK-NEXT: ![[FOO_THIS]] = !DIDerivedType(tag: DW_TAG_member, name: "__this", scope: ![[#]], file: ![[#]], line: [[#]], baseType: ![[#]], size: [[#]]) // CHECK-NEXT: ![[FOO_AA]] = !DIDerivedType(tag: DW_TAG_member, name: "aa", scope: ![[#]], file: ![[#]], line: [[#]], baseType: ![[#]], size: [[#]], offset: [[#]]) // CHECK-NEXT: ![[FOO_OPERATOR]] = !DISubprogram(name: "operator()", scope: ![[#]], file: ![[#]], line: [[#]], type: ![[#]], scopeLine: [[#]], flags: DIFlagPublic | DIFlagPrototyped, spFlags: 0) From 6c143a86cddbc6d0431dd643bfc7d4f017042512 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Wed, 4 Sep 2024 18:54:07 +0530 Subject: [PATCH 096/425] [CodeGen][NewPM] Port MachineCSE pass to new pass manager. (#106605) --- llvm/include/llvm/CodeGen/MachineCSE.h | 29 ++ llvm/include/llvm/CodeGen/Passes.h | 2 +- llvm/include/llvm/InitializePasses.h | 2 +- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1 + .../llvm/Passes/MachinePassRegistry.def | 2 +- llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/MachineCSE.cpp | 284 ++++++++++-------- llvm/lib/CodeGen/TargetPassConfig.cpp | 6 +- llvm/lib/Passes/PassBuilder.cpp | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 2 +- .../GlobalISel/machine-cse-mid-pipeline.mir | 1 + .../AArch64/sve-pfalse-machine-cse.mir | 1 + .../no-cse-nonlocal-convergent-instrs.mir | 1 + .../copyprop_regsequence_with_undef.mir | 1 + llvm/test/CodeGen/AMDGPU/machine-cse-ssa.mir | 8 +- .../CodeGen/PowerPC/machine-cse-rm-pre.mir | 1 + .../CodeGen/Thumb/machine-cse-deadreg.mir | 1 + .../CodeGen/Thumb/machine-cse-physreg.mir | 1 + llvm/test/CodeGen/X86/cse-two-preds.mir | 1 + llvm/test/DebugInfo/MIR/X86/machine-cse.mir | 1 + 21 files changed, 216 insertions(+), 134 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/MachineCSE.h diff --git a/llvm/include/llvm/CodeGen/MachineCSE.h b/llvm/include/llvm/CodeGen/MachineCSE.h new file mode 100644 index 0000000000000..f83c25bf39120 --- /dev/null +++ b/llvm/include/llvm/CodeGen/MachineCSE.h @@ -0,0 +1,29 @@ +//===- llvm/CodeGen/MachineCSE.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINECSE_H +#define LLVM_CODEGEN_MACHINECSE_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class MachineCSEPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + + MachineFunctionProperties getRequiredProperties() { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_MACHINECSE_H diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index dbdd110b0600e..ddb2012cd2bff 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -330,7 +330,7 @@ namespace llvm { extern char &GCMachineCodeAnalysisID; /// MachineCSE - This pass performs global CSE on machine instructions. - extern char &MachineCSEID; + extern char &MachineCSELegacyID; /// MIRCanonicalizer - This pass canonicalizes MIR by renaming vregs /// according to the semantics of the instruction as well as hoists diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 47a1ca15fc0d1..6605c6fde9251 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -188,7 +188,7 @@ void initializeMachineBlockPlacementPass(PassRegistry &); void initializeMachineBlockPlacementStatsPass(PassRegistry &); void initializeMachineBranchProbabilityInfoWrapperPassPass(PassRegistry &); void initializeMachineCFGPrinterPass(PassRegistry &); -void initializeMachineCSEPass(PassRegistry &); +void initializeMachineCSELegacyPass(PassRegistry &); void initializeMachineCombinerPass(PassRegistry &); void initializeMachineCopyPropagationPass(PassRegistry &); void initializeMachineCycleInfoPrinterPassPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 602e64f23ed51..a99fed86d168d 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -42,6 +42,7 @@ #include "llvm/CodeGen/LocalStackSlotAllocation.h" #include "llvm/CodeGen/LowerEmuTLS.h" #include "llvm/CodeGen/MIRPrinter.h" +#include "llvm/CodeGen/MachineCSE.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 0423e20f58d30..4047fd0478579 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -131,6 +131,7 @@ MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis()) #endif MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass()) MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass()) +MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass()) MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass()) MACHINE_FUNCTION_PASS("no-op-machine-function", NoOpMachineFunctionPass()) MACHINE_FUNCTION_PASS("phi-node-elimination", PHIEliminationPass()) @@ -219,7 +220,6 @@ DUMMY_MACHINE_FUNCTION_PASS("livedebugvalues", LiveDebugValuesPass) DUMMY_MACHINE_FUNCTION_PASS("lrshrink", LiveRangeShrinkPass) DUMMY_MACHINE_FUNCTION_PASS("machine-combiner", MachineCombinerPass) 
DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass) -DUMMY_MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass) DUMMY_MACHINE_FUNCTION_PASS("machine-function-splitter", MachineFunctionSplitterPass) DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass) DUMMY_MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadata) diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 177702054a0e3..16b8d456748fa 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -75,7 +75,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineBlockPlacementPass(Registry); initializeMachineBlockPlacementStatsPass(Registry); initializeMachineCFGPrinterPass(Registry); - initializeMachineCSEPass(Registry); + initializeMachineCSELegacyPass(Registry); initializeMachineCombinerPass(Registry); initializeMachineCopyPropagationPass(Registry); initializeMachineCycleInfoPrinterPassPass(Registry); diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index 2ac1fae9ea48c..8e9fcccff7764 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineCSE.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/SmallPtrSet.h" @@ -25,6 +26,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -69,110 +71,110 @@ static cl::opt AggressiveMachineCSE( namespace { - class MachineCSE : public MachineFunctionPass { - const TargetInstrInfo *TII = nullptr; - const TargetRegisterInfo *TRI = nullptr; - MachineDominatorTree *DT = nullptr; - MachineRegisterInfo *MRI = nullptr; - MachineBlockFrequencyInfo *MBFI = nullptr; - - public: - static char ID; // Pass identification - - MachineCSE() : MachineFunctionPass(ID) { - initializeMachineCSEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - AU.addPreservedID(MachineLoopInfoID); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - } - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties() - .set(MachineFunctionProperties::Property::IsSSA); - } +class MachineCSEImpl { + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineDominatorTree *DT = nullptr; + MachineRegisterInfo *MRI = nullptr; + MachineBlockFrequencyInfo *MBFI = nullptr; + +public: + MachineCSEImpl(MachineDominatorTree *DT, MachineBlockFrequencyInfo *MBFI) + : DT(DT), MBFI(MBFI) {} + bool run(MachineFunction &MF); + +private: + using AllocatorTy = + RecyclingAllocator>; + using ScopedHTType = + ScopedHashTable; + using ScopeType = ScopedHTType::ScopeTy; + using PhysDefVector = SmallVector, 2>; + + unsigned LookAheadLimit = 0; + DenseMap ScopeMap; + DenseMap + PREMap; + ScopedHTType VNT; + SmallVector Exps; + unsigned CurrVN = 0; + + bool PerformTrivialCopyPropagation(MachineInstr *MI, MachineBasicBlock *MBB); + bool isPhysDefTriviallyDead(MCRegister Reg, + 
MachineBasicBlock::const_iterator I, + MachineBasicBlock::const_iterator E) const; + bool hasLivePhysRegDefUses(const MachineInstr *MI, + const MachineBasicBlock *MBB, + SmallSet &PhysRefs, + PhysDefVector &PhysDefs, bool &PhysUseDef) const; + bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, + SmallSet &PhysRefs, + PhysDefVector &PhysDefs, bool &NonLocal) const; + bool isCSECandidate(MachineInstr *MI); + bool isProfitableToCSE(Register CSReg, Register Reg, MachineBasicBlock *CSBB, + MachineInstr *MI); + void EnterScope(MachineBasicBlock *MBB); + void ExitScope(MachineBasicBlock *MBB); + bool ProcessBlockCSE(MachineBasicBlock *MBB); + void ExitScopeIfDone(MachineDomTreeNode *Node, + DenseMap &OpenChildren); + bool PerformCSE(MachineDomTreeNode *Node); + + bool isPRECandidate(MachineInstr *MI, SmallSet &PhysRefs); + bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB); + bool PerformSimplePRE(MachineDominatorTree *DT); + /// Heuristics to see if it's profitable to move common computations of MBB + /// and MBB1 to CandidateBB. + bool isProfitableToHoistInto(MachineBasicBlock *CandidateBB, + MachineBasicBlock *MBB, MachineBasicBlock *MBB1); + void releaseMemory(); +}; + +class MachineCSELegacy : public MachineFunctionPass { +public: + static char ID; // Pass identification + + MachineCSELegacy() : MachineFunctionPass(ID) { + initializeMachineCSELegacyPass(*PassRegistry::getPassRegistry()); + } - void releaseMemory() override { - ScopeMap.clear(); - PREMap.clear(); - Exps.clear(); - } + bool runOnMachineFunction(MachineFunction &MF) override; - private: - using AllocatorTy = RecyclingAllocator>; - using ScopedHTType = - ScopedHashTable; - using ScopeType = ScopedHTType::ScopeTy; - using PhysDefVector = SmallVector, 2>; - - unsigned LookAheadLimit = 0; - DenseMap ScopeMap; - DenseMap - PREMap; - ScopedHTType VNT; - SmallVector Exps; - unsigned CurrVN = 0; - - bool PerformTrivialCopyPropagation(MachineInstr *MI, - MachineBasicBlock *MBB); - bool isPhysDefTriviallyDead(MCRegister Reg, - MachineBasicBlock::const_iterator I, - MachineBasicBlock::const_iterator E) const; - bool hasLivePhysRegDefUses(const MachineInstr *MI, - const MachineBasicBlock *MBB, - SmallSet &PhysRefs, - PhysDefVector &PhysDefs, bool &PhysUseDef) const; - bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, - SmallSet &PhysRefs, - PhysDefVector &PhysDefs, bool &NonLocal) const; - bool isCSECandidate(MachineInstr *MI); - bool isProfitableToCSE(Register CSReg, Register Reg, - MachineBasicBlock *CSBB, MachineInstr *MI); - void EnterScope(MachineBasicBlock *MBB); - void ExitScope(MachineBasicBlock *MBB); - bool ProcessBlockCSE(MachineBasicBlock *MBB); - void ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap &OpenChildren); - bool PerformCSE(MachineDomTreeNode *Node); - - bool isPRECandidate(MachineInstr *MI, SmallSet &PhysRefs); - bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB); - bool PerformSimplePRE(MachineDominatorTree *DT); - /// Heuristics to see if it's profitable to move common computations of MBB - /// and MBB1 to CandidateBB. 
- bool isProfitableToHoistInto(MachineBasicBlock *CandidateBB, - MachineBasicBlock *MBB, - MachineBasicBlock *MBB1); - }; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + AU.addPreservedID(MachineLoopInfoID); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + } + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } +}; } // end anonymous namespace -char MachineCSE::ID = 0; +char MachineCSELegacy::ID = 0; -char &llvm::MachineCSEID = MachineCSE::ID; +char &llvm::MachineCSELegacyID = MachineCSELegacy::ID; -INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(MachineCSELegacy, DEBUG_TYPE, "Machine Common Subexpression Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) -INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE, +INITIALIZE_PASS_END(MachineCSELegacy, DEBUG_TYPE, "Machine Common Subexpression Elimination", false, false) /// The source register of a COPY machine instruction can be propagated to all /// its users, and this propagation could increase the probability of finding /// common subexpressions. If the COPY has only one user, the COPY itself can /// be removed. -bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, - MachineBasicBlock *MBB) { +bool MachineCSEImpl::PerformTrivialCopyPropagation(MachineInstr *MI, + MachineBasicBlock *MBB) { bool Changed = false; for (MachineOperand &MO : MI->all_uses()) { Register Reg = MO.getReg(); @@ -225,7 +227,7 @@ bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, return Changed; } -bool MachineCSE::isPhysDefTriviallyDead( +bool MachineCSEImpl::isPhysDefTriviallyDead( MCRegister Reg, MachineBasicBlock::const_iterator I, MachineBasicBlock::const_iterator E) const { unsigned LookAheadLeft = LookAheadLimit; @@ -282,11 +284,11 @@ static bool isCallerPreservedOrConstPhysReg(MCRegister Reg, /// physical registers (except for dead defs of physical registers). It also /// returns the physical register def by reference if it's the only one and the /// instruction does not uses a physical register. -bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, - const MachineBasicBlock *MBB, - SmallSet &PhysRefs, - PhysDefVector &PhysDefs, - bool &PhysUseDef) const { +bool MachineCSEImpl::hasLivePhysRegDefUses(const MachineInstr *MI, + const MachineBasicBlock *MBB, + SmallSet &PhysRefs, + PhysDefVector &PhysDefs, + bool &PhysUseDef) const { // First, add all uses to PhysRefs. for (const MachineOperand &MO : MI->all_uses()) { Register Reg = MO.getReg(); @@ -333,10 +335,10 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, return !PhysRefs.empty(); } -bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, - SmallSet &PhysRefs, - PhysDefVector &PhysDefs, - bool &NonLocal) const { +bool MachineCSEImpl::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, + SmallSet &PhysRefs, + PhysDefVector &PhysDefs, + bool &NonLocal) const { // For now conservatively returns false if the common subexpression is // not in the same basic block as the given instruction. The only exception // is if the common subexpression is in the sole predecessor block. 
@@ -400,7 +402,7 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, return false; } -bool MachineCSE::isCSECandidate(MachineInstr *MI) { +bool MachineCSEImpl::isCSECandidate(MachineInstr *MI) { if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() || MI->isInlineAsm() || MI->isDebugInstr() || MI->isJumpTableDebugInfo() || MI->isFakeUse()) @@ -437,8 +439,9 @@ bool MachineCSE::isCSECandidate(MachineInstr *MI) { /// isProfitableToCSE - Return true if it's profitable to eliminate MI with a /// common expression that defines Reg. CSBB is basic block where CSReg is /// defined. -bool MachineCSE::isProfitableToCSE(Register CSReg, Register Reg, - MachineBasicBlock *CSBB, MachineInstr *MI) { +bool MachineCSEImpl::isProfitableToCSE(Register CSReg, Register Reg, + MachineBasicBlock *CSBB, + MachineInstr *MI) { if (AggressiveMachineCSE) return true; @@ -513,13 +516,13 @@ bool MachineCSE::isProfitableToCSE(Register CSReg, Register Reg, return !HasPHI; } -void MachineCSE::EnterScope(MachineBasicBlock *MBB) { +void MachineCSEImpl::EnterScope(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n'); ScopeType *Scope = new ScopeType(VNT); ScopeMap[MBB] = Scope; } -void MachineCSE::ExitScope(MachineBasicBlock *MBB) { +void MachineCSEImpl::ExitScope(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n'); DenseMap::iterator SI = ScopeMap.find(MBB); assert(SI != ScopeMap.end()); @@ -527,7 +530,7 @@ void MachineCSE::ExitScope(MachineBasicBlock *MBB) { ScopeMap.erase(SI); } -bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) { +bool MachineCSEImpl::ProcessBlockCSE(MachineBasicBlock *MBB) { bool Changed = false; SmallVector, 8> CSEPairs; @@ -748,9 +751,9 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) { /// ExitScopeIfDone - Destroy scope for the MBB that corresponds to the given /// dominator tree node if its a leaf or all of its children are done. Walk /// up the dominator tree to destroy ancestors which are now done. -void -MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap &OpenChildren) { +void MachineCSEImpl::ExitScopeIfDone( + MachineDomTreeNode *Node, + DenseMap &OpenChildren) { if (OpenChildren[Node]) return; @@ -767,7 +770,7 @@ MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, } } -bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { +bool MachineCSEImpl::PerformCSE(MachineDomTreeNode *Node) { SmallVector Scopes; SmallVector WorkList; DenseMap OpenChildren; @@ -799,8 +802,8 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { // We use stronger checks for PRE candidate rather than for CSE ones to embrace // checks inside ProcessBlockCSE(), not only inside isCSECandidate(). This helps // to exclude instrs created by PRE that won't be CSEed later. 
-bool MachineCSE::isPRECandidate(MachineInstr *MI, - SmallSet &PhysRefs) { +bool MachineCSEImpl::isPRECandidate(MachineInstr *MI, + SmallSet &PhysRefs) { if (!isCSECandidate(MI) || MI->isNotDuplicable() || MI->mayLoad() || @@ -821,8 +824,8 @@ bool MachineCSE::isPRECandidate(MachineInstr *MI, return true; } -bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT, - MachineBasicBlock *MBB) { +bool MachineCSEImpl::ProcessBlockPRE(MachineDominatorTree *DT, + MachineBasicBlock *MBB) { bool Changed = false; for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) { SmallSet PhysRefs; @@ -902,7 +905,7 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT, // anticipating that the next CSE step will eliminate this created redundancy. // If CSE doesn't eliminate this, than created instruction will remain dead // and eliminated later by Remove Dead Machine Instructions pass. -bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) { +bool MachineCSEImpl::PerformSimplePRE(MachineDominatorTree *DT) { SmallVector BBs; PREMap.clear(); @@ -920,9 +923,9 @@ bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) { return Changed; } -bool MachineCSE::isProfitableToHoistInto(MachineBasicBlock *CandidateBB, - MachineBasicBlock *MBB, - MachineBasicBlock *MBB1) { +bool MachineCSEImpl::isProfitableToHoistInto(MachineBasicBlock *CandidateBB, + MachineBasicBlock *MBB, + MachineBasicBlock *MBB1) { if (CandidateBB->getParent()->getFunction().hasMinSize()) return true; assert(DT->dominates(CandidateBB, MBB) && "CandidateBB should dominate MBB"); @@ -932,18 +935,55 @@ bool MachineCSE::isProfitableToHoistInto(MachineBasicBlock *CandidateBB, MBFI->getBlockFreq(MBB) + MBFI->getBlockFreq(MBB1); } -bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; +void MachineCSEImpl::releaseMemory() { + ScopeMap.clear(); + PREMap.clear(); + Exps.clear(); +} +bool MachineCSEImpl::run(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - DT = &getAnalysis().getDomTree(); - MBFI = &getAnalysis().getMBFI(); LookAheadLimit = TII->getMachineCSELookAheadLimit(); bool ChangedPRE, ChangedCSE; ChangedPRE = PerformSimplePRE(DT); ChangedCSE = PerformCSE(DT->getRootNode()); + releaseMemory(); return ChangedPRE || ChangedCSE; } + +PreservedAnalyses MachineCSEPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + MFPropsModifier _(*this, MF); + + if (MF.getFunction().hasOptNone()) + return PreservedAnalyses::all(); + + MachineDominatorTree &MDT = MFAM.getResult(MF); + MachineBlockFrequencyInfo &MBFI = + MFAM.getResult(MF); + MachineCSEImpl Impl(&MDT, &MBFI); + bool Changed = Impl.run(MF); + if (!Changed) + return PreservedAnalyses::all(); + + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserveSet(); + return PA; +} + +bool MachineCSELegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + MachineDominatorTree &MDT = + getAnalysis().getDomTree(); + MachineBlockFrequencyInfo &MBFI = + getAnalysis().getMBFI(); + MachineCSEImpl Impl(&MDT, &MBFI); + return Impl.run(MF); +} diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index c0b834650d73b..11a7752ef7a38 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -311,7 +311,7 @@ static IdentifyingPassPtr overridePass(AnalysisID 
StandardID, if (StandardID == &EarlyMachineLICMID) return applyDisable(TargetID, DisableMachineLICM); - if (StandardID == &MachineCSEID) + if (StandardID == &MachineCSELegacyID) return applyDisable(TargetID, DisableMachineCSE); if (StandardID == &MachineLICMID) @@ -523,7 +523,7 @@ void llvm::registerCodeGenCallback(PassInstrumentationCallbacks &PIC, DISABLE_PASS(DisableCopyProp, MachineCopyPropagationPass) DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterPass) DISABLE_PASS(DisableEarlyTailDup, EarlyTailDuplicatePass) - DISABLE_PASS(DisableMachineCSE, MachineCSEPass) + DISABLE_PASS(DisableMachineCSE, MachineCSELegacyPass) DISABLE_PASS(DisableMachineDCE, DeadMachineInstructionElimPass) DISABLE_PASS(DisableMachineLICM, EarlyMachineLICMPass) DISABLE_PASS(DisableMachineSink, MachineSinkingPass) @@ -1305,7 +1305,7 @@ void TargetPassConfig::addMachineSSAOptimization() { addILPOpts(); addPass(&EarlyMachineLICMID); - addPass(&MachineCSEID); + addPass(&MachineCSELegacyID); addPass(&MachineSinkingID); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 1df1449fce597..a22abed8051a1 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -100,6 +100,7 @@ #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineCSE.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineLoopInfo.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 18e466d2bd5b5..9c9c505139373 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1277,7 +1277,7 @@ void GCNPassConfig::addMachineSSAOptimization() { if (isPassEnabled(EnableSDWAPeephole)) { addPass(&SIPeepholeSDWAID); addPass(&EarlyMachineLICMID); - addPass(&MachineCSEID); + addPass(&MachineCSELegacyID); addPass(&SIFoldOperandsLegacyID); } addPass(&DeadMachineInstructionElimID); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index e86d3771bd2f2..57b7fa783c14a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -462,7 +462,7 @@ void NVPTXPassConfig::addMachineSSAOptimization() { printAndVerify("After ILP optimizations"); addPass(&EarlyMachineLICMID); - addPass(&MachineCSEID); + addPass(&MachineCSELegacyID); addPass(&MachineSinkingID); printAndVerify("After Machine LICM, CSE and Sinking passes"); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir b/llvm/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir index 2b69c13174f6c..015ce5ec2dca6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir @@ -1,4 +1,5 @@ # RUN: llc -run-pass machine-cse -verify-machineinstrs -mtriple aarch64-apple-ios %s -o - | FileCheck %s +# RUN: llc -passes machine-cse -mtriple aarch64-apple-ios %s -o - | FileCheck %s --- name: irtranslated legalized: false diff --git a/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir b/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir index 8395a7619fbb4..5ebc2f61eaafb 100644 --- a/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir +++ b/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been 
autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass=machine-cse -mtriple=aarch64 -mattr=+sve -o - %s | FileCheck %s +# RUN: llc -passes=machine-cse -mtriple=aarch64 -mattr=+sve -o - %s | FileCheck %s --- name: pfalse tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir index 684b5ec3883b2..6eb4df2b48700 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -o - -run-pass=machine-cse %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -o - -passes=machine-cse %s | FileCheck %s # LLVM's current definition of `isConvergent` does not necessarily prove that # non-local CSE is illegal. The following test extends the definition of diff --git a/llvm/test/CodeGen/AMDGPU/copyprop_regsequence_with_undef.mir b/llvm/test/CodeGen/AMDGPU/copyprop_regsequence_with_undef.mir index 1e12a3b22e9a4..fee1391d150f9 100644 --- a/llvm/test/CodeGen/AMDGPU/copyprop_regsequence_with_undef.mir +++ b/llvm/test/CodeGen/AMDGPU/copyprop_regsequence_with_undef.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -run-pass=machine-cse -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -passes=machine-cse -o - %s | FileCheck %s # Test to ensure that this does not crash on undefs --- diff --git a/llvm/test/CodeGen/AMDGPU/machine-cse-ssa.mir b/llvm/test/CodeGen/AMDGPU/machine-cse-ssa.mir index 89b204d715ded..d32737f05a9b0 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-cse-ssa.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-cse-ssa.mir @@ -1,8 +1,10 @@ # REQUIRES: asserts -# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-cse -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-cse -o /dev/null %s 2>&1 | FileCheck -check-prefixes=ERR,ERR-LEGACY %s +# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=machine-cse -o /dev/null %s 2>&1 | FileCheck -check-prefixes=ERR,ERR-NPM %s -# ERR: MachineFunctionProperties required by Machine Common Subexpression Elimination pass are not met by function not_ssa. -# ERR-NEXT: Required properties: IsSSA +# ERR-LEGACY: MachineFunctionProperties required by Machine Common Subexpression Elimination pass are not met by function not_ssa. +# ERR-NPM: MachineFunctionProperties required by MachineCSEPass pass are not met by function not_ssa. 
+# ERR: Required properties: IsSSA # ERR-NEXT: Current properties: NoPHIs --- diff --git a/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir b/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir index 32f5e0172047e..0e9459238ce9d 100644 --- a/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir +++ b/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc %s -o - -mtriple=powerpc-unknown-unknown -run-pass=machine-cse -verify-machineinstrs | FileCheck %s +# RUN: llc %s -o - -mtriple=powerpc-unknown-unknown -passes=machine-cse | FileCheck %s --- | define void @can_pre() { entry: diff --git a/llvm/test/CodeGen/Thumb/machine-cse-deadreg.mir b/llvm/test/CodeGen/Thumb/machine-cse-deadreg.mir index e4db7abeea354..cee5c24847f34 100644 --- a/llvm/test/CodeGen/Thumb/machine-cse-deadreg.mir +++ b/llvm/test/CodeGen/Thumb/machine-cse-deadreg.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple thumbv6m-arm-none-eabi -run-pass=machine-cse -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple thumbv6m-arm-none-eabi -passes=machine-cse -o - %s | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/Thumb/machine-cse-physreg.mir b/llvm/test/CodeGen/Thumb/machine-cse-physreg.mir index 2fa22feb4e1b3..58e1eca22711a 100644 --- a/llvm/test/CodeGen/Thumb/machine-cse-physreg.mir +++ b/llvm/test/CodeGen/Thumb/machine-cse-physreg.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple thumbv5e -run-pass=machine-cse -o - %s | FileCheck %s +# RUN: llc -mtriple thumbv5e -passes=machine-cse -o - %s | FileCheck %s # This is a contrived example made to expose a bug in # MachineCSE, see PR32538. diff --git a/llvm/test/CodeGen/X86/cse-two-preds.mir b/llvm/test/CodeGen/X86/cse-two-preds.mir index 6479747daf426..e6f04a6ce66d4 100644 --- a/llvm/test/CodeGen/X86/cse-two-preds.mir +++ b/llvm/test/CodeGen/X86/cse-two-preds.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 # RUN: llc -mtriple=x86_64 -verify-machineinstrs --run-pass=machine-cse -o - %s | FileCheck %s +# RUN: llc -mtriple=x86_64 -passes=machine-cse -o - %s | FileCheck %s --- | target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir index 120dbdf850cc4..9bcb4408312a6 100644 --- a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir +++ b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir @@ -1,4 +1,5 @@ # RUN: llc %s -o - -run-pass=machine-cse -mtriple=x86_64-- | FileCheck %s +# RUN: llc %s -o - -passes=machine-cse -mtriple=x86_64-- | FileCheck %s # # This test examines machine-cse's behaviour when dealing with copy propagation, # the code for which is lifted from test/CodeGen/X86/machine-cse.ll. There are From 7732d8e51819416b9d28b1815bdf81d0e0642b04 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 06:51:30 -0700 Subject: [PATCH 097/425] [ADT] Deprecate DenseMap::FindAndConstruct (#107224) I've migrated all uses of FindAndConstruct to operator[] and try_emplace. This patch inlines FindAndConstruct into operator[] and deprecates FindAndConstruct. 
--- llvm/include/llvm/ADT/DenseMap.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index e78700f9a9f3a..745288ff047f4 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -354,7 +354,8 @@ class DenseMapBase : public DebugEpochBase { incrementNumTombstones(); } - value_type& FindAndConstruct(const KeyT &Key) { + LLVM_DEPRECATED("Use [Key] instead", "[Key]") + value_type &FindAndConstruct(const KeyT &Key) { BucketT *TheBucket; if (LookupBucketFor(Key, TheBucket)) return *TheBucket; @@ -363,10 +364,15 @@ class DenseMapBase : public DebugEpochBase { } ValueT &operator[](const KeyT &Key) { - return FindAndConstruct(Key).second; + BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return TheBucket->second; + + return InsertIntoBucket(TheBucket, Key)->second; } - value_type& FindAndConstruct(KeyT &&Key) { + LLVM_DEPRECATED("Use [Key] instead", "[Key]") + value_type &FindAndConstruct(KeyT &&Key) { BucketT *TheBucket; if (LookupBucketFor(Key, TheBucket)) return *TheBucket; @@ -375,7 +381,11 @@ class DenseMapBase : public DebugEpochBase { } ValueT &operator[](KeyT &&Key) { - return FindAndConstruct(std::move(Key)).second; + BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return TheBucket->second; + + return InsertIntoBucket(TheBucket, std::move(Key))->second; } /// isPointerIntoBucketsArray - Return true if the specified pointer points From 1b0a80249399dadfe0c3f682fff77bf9eb666535 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 4 Sep 2024 07:02:55 -0700 Subject: [PATCH 098/425] [GDBRemote] Handle 'heap' memory region info type (#105883) This should cause the memory region info "is stack" field to be set to "no". 
--- .../Process/gdb-remote/GDBRemoteCommunicationClient.cpp | 2 ++ .../gdb-remote/GDBRemoteCommunicationClientTest.cpp | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index 0297fe363f69e..55d76ca8532d3 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -1638,6 +1638,8 @@ Status GDBRemoteCommunicationClient::GetMemoryRegionInfo( for (llvm::StringRef entry : llvm::split(value, ',')) { if (entry == "stack") region_info.SetIsStackMemory(MemoryRegionInfo::eYes); + else if (entry == "heap") + region_info.SetIsStackMemory(MemoryRegionInfo::eNo); } } else if (name == "error") { StringExtractorGDBRemote error_extractor(value); diff --git a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp index 18020c8e43fe0..ce5ab2cf50829 100644 --- a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp +++ b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp @@ -364,6 +364,15 @@ TEST_F(GDBRemoteCommunicationClientTest, GetMemoryRegionInfo) { EXPECT_TRUE(result.get().Success()); EXPECT_EQ(MemoryRegionInfo::eYes, region_info.GetMemoryTagged()); EXPECT_EQ(MemoryRegionInfo::eYes, region_info.IsStackMemory()); + + result = std::async(std::launch::async, [&] { + return client.GetMemoryRegionInfo(addr, region_info); + }); + + HandlePacket(server, "qMemoryRegionInfo:a000", + "start:a000;size:2000;type:heap;"); + EXPECT_TRUE(result.get().Success()); + EXPECT_EQ(MemoryRegionInfo::eNo, region_info.IsStackMemory()); } TEST_F(GDBRemoteCommunicationClientTest, GetMemoryRegionInfoInvalidResponse) { From 30f1cfb4d0784de869ab3a4a9774b696b9769093 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 4 Sep 2024 07:04:32 -0700 Subject: [PATCH 099/425] [TableGen] Print memory stats in detailed record emitter (#106990) Print memory allocation and related statistics when dumping detailed record information. 
--- llvm/include/llvm/TableGen/Record.h | 4 ++- llvm/lib/TableGen/DetailedRecordsBackend.cpp | 17 ++++++---- llvm/lib/TableGen/Record.cpp | 33 ++++++++++++++++++++ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index a339946e67cf2..ff596df94e4f5 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -1757,7 +1757,7 @@ class Record { ArrayRef getAssertions() const { return Assertions; } ArrayRef getDumps() const { return Dumps; } - ArrayRef> getSuperClasses() const { + ArrayRef> getSuperClasses() const { return SuperClasses; } @@ -2073,6 +2073,8 @@ class RecordKeeper { void dump() const; + void dumpAllocationStats(raw_ostream &OS) const; + private: RecordKeeper(RecordKeeper &&) = delete; RecordKeeper(const RecordKeeper &) = delete; diff --git a/llvm/lib/TableGen/DetailedRecordsBackend.cpp b/llvm/lib/TableGen/DetailedRecordsBackend.cpp index 45e621483c817..7d17c4d68c3f1 100644 --- a/llvm/lib/TableGen/DetailedRecordsBackend.cpp +++ b/llvm/lib/TableGen/DetailedRecordsBackend.cpp @@ -41,6 +41,7 @@ class DetailedRecordsEmitter { void printVariables(raw_ostream &OS); void printClasses(raw_ostream &OS); void printRecords(raw_ostream &OS); + void printAllocationStats(raw_ostream &OS); void printDefms(const Record &Rec, raw_ostream &OS); void printTemplateArgs(const Record &Rec, raw_ostream &OS); void printSuperclasses(const Record &Rec, raw_ostream &OS); @@ -55,6 +56,7 @@ void DetailedRecordsEmitter::run(raw_ostream &OS) { printVariables(OS); printClasses(OS); printRecords(OS); + printAllocationStats(OS); } // Print the report heading, including the source file name. @@ -62,8 +64,7 @@ void DetailedRecordsEmitter::printReportHeading(raw_ostream &OS) { OS << formatv("DETAILED RECORDS for file {0}\n", Records.getInputFilename()); } -// Print a section heading with the name of the section and -// the item count. +// Print a section heading with the name of the section and the item count. void DetailedRecordsEmitter::printSectionHeading(StringRef Title, int Count, raw_ostream &OS) { OS << formatv("\n{0} {1} ({2}) {0}\n", "--------------------", Title, Count); @@ -79,8 +80,7 @@ void DetailedRecordsEmitter::printVariables(raw_ostream &OS) { OS << Var.first << " = " << Var.second->getAsString() << '\n'; } -// Print the classes, including the template arguments, superclasses, -// and fields. +// Print classes, including the template arguments, superclasses, and fields. void DetailedRecordsEmitter::printClasses(raw_ostream &OS) { const auto &ClassList = Records.getClasses(); printSectionHeading("Classes", ClassList.size(), OS); @@ -94,8 +94,7 @@ void DetailedRecordsEmitter::printClasses(raw_ostream &OS) { } } -// Print the records, including the defm sequences, supercasses, -// and fields. +// Print the records, including the defm sequences, supercasses, and fields. void DetailedRecordsEmitter::printRecords(raw_ostream &OS) { const auto &RecordList = Records.getDefs(); printSectionHeading("Records", RecordList.size(), OS); @@ -110,6 +109,12 @@ void DetailedRecordsEmitter::printRecords(raw_ostream &OS) { } } +// Print memory allocation related stats. +void DetailedRecordsEmitter::printAllocationStats(raw_ostream &OS) { + OS << formatv("\n{0} Memory Allocation Stats {0}\n", "--------------------"); + Records.dumpAllocationStats(OS); +} + // Print the record's defm source locations, if any. Note that they // are stored in the reverse order of their invocation. 
 void DetailedRecordsEmitter::printDefms(const Record &Rec, raw_ostream &OS) {
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index bcecee8e550c8..cead8f865a607 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -92,10 +92,39 @@ struct RecordKeeperImpl {
   unsigned AnonCounter;
   unsigned LastRecordID;
+
+  void dumpAllocationStats(raw_ostream &OS) const;
 };
 } // namespace detail
 } // namespace llvm
 
+void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const {
+  // Dump memory allocation related stats.
+  OS << "TheArgumentInitPool size = " << TheArgumentInitPool.size() << '\n';
+  OS << "TheBitsInitPool size = " << TheBitsInitPool.size() << '\n';
+  OS << "TheIntInitPool size = " << TheIntInitPool.size() << '\n';
+  OS << "TheBitsInitPool size = " << TheBitsInitPool.size() << '\n';
+  OS << "TheListInitPool size = " << TheListInitPool.size() << '\n';
+  OS << "TheUnOpInitPool size = " << TheUnOpInitPool.size() << '\n';
+  OS << "TheBinOpInitPool size = " << TheBinOpInitPool.size() << '\n';
+  OS << "TheTernOpInitPool size = " << TheTernOpInitPool.size() << '\n';
+  OS << "TheFoldOpInitPool size = " << TheFoldOpInitPool.size() << '\n';
+  OS << "TheIsAOpInitPool size = " << TheIsAOpInitPool.size() << '\n';
+  OS << "TheExistsOpInitPool size = " << TheExistsOpInitPool.size() << '\n';
+  OS << "TheCondOpInitPool size = " << TheCondOpInitPool.size() << '\n';
+  OS << "TheDagInitPool size = " << TheDagInitPool.size() << '\n';
+  OS << "RecordTypePool size = " << RecordTypePool.size() << '\n';
+  OS << "TheVarInitPool size = " << TheVarInitPool.size() << '\n';
+  OS << "TheVarBitInitPool size = " << TheVarBitInitPool.size() << '\n';
+  OS << "TheVarDefInitPool size = " << TheVarDefInitPool.size() << '\n';
+  OS << "TheFieldInitPool size = " << TheFieldInitPool.size() << '\n';
+  OS << "Bytes allocated = " << Allocator.getBytesAllocated() << '\n';
+  OS << "Total allocator memory = " << Allocator.getTotalMemory() << "\n\n";
+
+  OS << "Number of records instantiated = " << LastRecordID << '\n';
+  OS << "Number of anonymous records = " << AnonCounter << '\n';
+}
+
 //===----------------------------------------------------------------------===//
 // Type implementations
 //===----------------------------------------------------------------------===//
 
@@ -3261,6 +3290,10 @@ RecordKeeper::getAllDerivedDefinitionsIfDefined(StringRef ClassName) const {
              : std::vector();
 }
 
+void RecordKeeper::dumpAllocationStats(raw_ostream &OS) const {
+  Impl->dumpAllocationStats(OS);
+}
+
 Init *MapResolver::resolve(Init *VarName) {
   auto It = Map.find(VarName);
   if (It == Map.end())

From e64ef634bbd940dfaae23d9fb43e6385014ffd10 Mon Sep 17 00:00:00 2001
From: Malek Ben Slimane <85631834+malek203@users.noreply.github.com>
Date: Wed, 4 Sep 2024 16:18:11 +0200
Subject: [PATCH 100/425] Thread Safety Analysis: Differentiate between lock sets at real join points and expected/actual sets at function end (#105526)

This fixes false positives related to returning a scoped lockable object.
At the end of a function, we check managed locks instead of scoped locks.
At real join points, we skip checking managed locks because we assume that
the scope keeps track of its underlying mutexes and will release them on
destruction. So, checking for the scopes is sufficient. However, at the end
of a function, we aim to compare the expected and the actual lock sets.
There, we skip checking scoped locks to avoid emitting duplicate warnings
for the same lock.
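For illustration, the kind of code that used to trigger a spurious "mutex is still held at the end of function" warning is a lock-returning accessor. A minimal sketch follows (using the capability-analysis attribute spellings from the Clang thread-safety documentation, not the exact test case updated below):

```cpp
// Minimal sketch, assuming compilation with -Wthread-safety.
#define CAPABILITY(x) __attribute__((capability(x)))
#define SCOPED_CAPABILITY __attribute__((scoped_lockable))
#define ACQUIRE(...) __attribute__((acquire_capability(__VA_ARGS__)))
#define RELEASE(...) __attribute__((release_capability(__VA_ARGS__)))

class CAPABILITY("mutex") Mutex {
public:
  void Lock() ACQUIRE();
  void Unlock() RELEASE();
};

class SCOPED_CAPABILITY MutexLock {
public:
  explicit MutexLock(Mutex *Mu) ACQUIRE(Mu) : Mu(Mu) { Mu->Lock(); }
  ~MutexLock() RELEASE() { Mu->Unlock(); }

private:
  Mutex *Mu;
};

class Object {
public:
  // The returned scope object keeps `mutex` held for the caller, so the
  // function legitimately ends with the mutex locked. Checking the managed
  // lock (instead of the scope) at function end removes the false positive.
  MutexLock lock() ACQUIRE(mutex) { return MutexLock(&mutex); }

private:
  Mutex mutex;
};
```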
--- clang/lib/Analysis/ThreadSafety.cpp | 8 ++++++-- .../SemaCXX/warn-thread-safety-analysis.cpp | 20 ++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp index e25b843c9bf83..c4a83b069e079 100644 --- a/clang/lib/Analysis/ThreadSafety.cpp +++ b/clang/lib/Analysis/ThreadSafety.cpp @@ -922,6 +922,9 @@ class ScopedLockableFactEntry : public FactEntry { handleRemovalFromIntersection(const FactSet &FSet, FactManager &FactMan, SourceLocation JoinLoc, LockErrorKind LEK, ThreadSafetyHandler &Handler) const override { + if (LEK == LEK_LockedAtEndOfFunction || LEK == LEK_NotLockedAtEndOfFunction) + return; + for (const auto &UnderlyingMutex : UnderlyingMutexes) { const auto *Entry = FSet.findLock(FactMan, UnderlyingMutex.Cap); if ((UnderlyingMutex.Kind == UCK_Acquired && Entry) || @@ -2224,7 +2227,7 @@ void ThreadSafetyAnalyzer::intersectAndWarn(FactSet &EntrySet, if (join(FactMan[*EntryIt], ExitFact, EntryLEK != LEK_LockedSomeLoopIterations)) *EntryIt = Fact; - } else if (!ExitFact.managed()) { + } else if (!ExitFact.managed() || EntryLEK == LEK_LockedAtEndOfFunction) { ExitFact.handleRemovalFromIntersection(ExitSet, FactMan, JoinLoc, EntryLEK, Handler); } @@ -2236,7 +2239,8 @@ void ThreadSafetyAnalyzer::intersectAndWarn(FactSet &EntrySet, const FactEntry *ExitFact = ExitSet.findLock(FactMan, *EntryFact); if (!ExitFact) { - if (!EntryFact->managed() || ExitLEK == LEK_LockedSomeLoopIterations) + if (!EntryFact->managed() || ExitLEK == LEK_LockedSomeLoopIterations || + ExitLEK == LEK_NotLockedAtEndOfFunction) EntryFact->handleRemovalFromIntersection(EntrySetOrig, FactMan, JoinLoc, ExitLEK, Handler); if (ExitLEK == LEK_LockedSomePredecessors) diff --git a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp index af9254508d805..8477200456d98 100644 --- a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp +++ b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp @@ -6077,24 +6077,20 @@ namespace ReturnScopedLockable { class Object { public: MutexLock lock() EXCLUSIVE_LOCK_FUNCTION(mutex) { - // TODO: False positive because scoped lock isn't destructed. - return MutexLock(&mutex); // expected-note {{mutex acquired here}} - } // expected-warning {{mutex 'mutex' is still held at the end of function}} + return MutexLock(&mutex); + } ReaderMutexLock lockShared() SHARED_LOCK_FUNCTION(mutex) { - // TODO: False positive because scoped lock isn't destructed. - return ReaderMutexLock(&mutex); // expected-note {{mutex acquired here}} - } // expected-warning {{mutex 'mutex' is still held at the end of function}} + return ReaderMutexLock(&mutex); + } MutexLock adopt() EXCLUSIVE_LOCKS_REQUIRED(mutex) { - // TODO: False positive because scoped lock isn't destructed. - return MutexLock(&mutex, true); // expected-note {{mutex acquired here}} - } // expected-warning {{mutex 'mutex' is still held at the end of function}} + return MutexLock(&mutex, true); + } ReaderMutexLock adoptShared() SHARED_LOCKS_REQUIRED(mutex) { - // TODO: False positive because scoped lock isn't destructed. 
- return ReaderMutexLock(&mutex, true); // expected-note {{mutex acquired here}} - } // expected-warning {{mutex 'mutex' is still held at the end of function}} + return ReaderMutexLock(&mutex, true); + } int x GUARDED_BY(mutex); void needsLock() EXCLUSIVE_LOCKS_REQUIRED(mutex); From 8f77d37f256809766fd83a09c6d144b785e9165a Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Wed, 4 Sep 2024 07:18:53 -0700 Subject: [PATCH 101/425] [DAGCombiner] cache negative result from getMergeStoreCandidates() (#106949) Cache negative search result from getStoreMergeCandidates() so that mergeConsecutiveStores() does not iterate quadratically over a potentially long sequence of unmergeable stores. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 83 ++++++++++++------- 1 file changed, 51 insertions(+), 32 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6390231341f96..37272a09b336a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -191,6 +191,11 @@ namespace { // AA - Used for DAG load/store alias analysis. AliasAnalysis *AA; + /// This caches all chains that have already been processed in + /// DAGCombiner::getStoreMergeCandidates() and found to have no mergeable + /// stores candidates. + SmallPtrSet ChainsWithoutMergeableStores; + /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. void AddUsersToWorklist(SDNode *N) { @@ -779,11 +784,10 @@ namespace { bool UseTrunc); /// This is a helper function for mergeConsecutiveStores. Stores that - /// potentially may be merged with St are placed in StoreNodes. RootNode is - /// a chain predecessor to all store candidates. - void getStoreMergeCandidates(StoreSDNode *St, - SmallVectorImpl &StoreNodes, - SDNode *&Root); + /// potentially may be merged with St are placed in StoreNodes. On success, + /// returns a chain predecessor to all store candidates. + SDNode *getStoreMergeCandidates(StoreSDNode *St, + SmallVectorImpl &StoreNodes); /// Helper function for mergeConsecutiveStores. Checks if candidate stores /// have indirect dependency through their operands. RootNode is the @@ -1785,6 +1789,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) { ++NodesCombined; + // Invalidate cached info. + ChainsWithoutMergeableStores.clear(); + // If we get back the same node we passed in, rather than a new node or // zero, we know that the node must have defined multiple values and // CombineTo was used. Since CombineTo takes care of the worklist @@ -20514,15 +20521,15 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( return true; } -void DAGCombiner::getStoreMergeCandidates( - StoreSDNode *St, SmallVectorImpl &StoreNodes, - SDNode *&RootNode) { +SDNode * +DAGCombiner::getStoreMergeCandidates(StoreSDNode *St, + SmallVectorImpl &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. We must have a base and an offset. Do not handle stores to undef // base pointers. BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); if (!BasePtr.getBase().getNode() || BasePtr.getBase().isUndef()) - return; + return nullptr; SDValue Val = peekThroughBitcasts(St->getValue()); StoreSource StoreSrc = getStoreSource(Val); @@ -20538,14 +20545,14 @@ void DAGCombiner::getStoreMergeCandidates( LoadVT = Ld->getMemoryVT(); // Load and store should be the same type. 
if (MemVT != LoadVT) - return; + return nullptr; // Loads must only have one use. if (!Ld->hasNUsesOfValue(1, 0)) - return; + return nullptr; // The memory operands must not be volatile/indexed/atomic. // TODO: May be able to relax for unordered atomics (see D66309) if (!Ld->isSimple() || Ld->isIndexed()) - return; + return nullptr; } auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { @@ -20613,6 +20620,27 @@ void DAGCombiner::getStoreMergeCandidates( return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); }; + // We are looking for a root node which is an ancestor to all mergable + // stores. We search up through a load, to our root and then down + // through all children. For instance we will find Store{1,2,3} if + // St is Store1, Store2. or Store3 where the root is not a load + // which always true for nonvolatile ops. TODO: Expand + // the search to find all valid candidates through multiple layers of loads. + // + // Root + // |-------|-------| + // Load Load Store3 + // | | + // Store1 Store2 + // + // FIXME: We should be able to climb and + // descend TokenFactors to find candidates as well. + + SDNode *RootNode = St->getChain().getNode(); + // Bail out if we already analyzed this root node and found nothing. + if (ChainsWithoutMergeableStores.contains(RootNode)) + return nullptr; + // Check if the pair of StoreNode and the RootNode already bail out many // times which is over the limit in dependence check. auto OverLimitInDependenceCheck = [&](SDNode *StoreNode, @@ -20636,28 +20664,13 @@ void DAGCombiner::getStoreMergeCandidates( } }; - // We looking for a root node which is an ancestor to all mergable - // stores. We search up through a load, to our root and then down - // through all children. For instance we will find Store{1,2,3} if - // St is Store1, Store2. or Store3 where the root is not a load - // which always true for nonvolatile ops. TODO: Expand - // the search to find all valid candidates through multiple layers of loads. - // - // Root - // |-------|-------| - // Load Load Store3 - // | | - // Store1 Store2 - // - // FIXME: We should be able to climb and - // descend TokenFactors to find candidates as well. - - RootNode = St->getChain().getNode(); - unsigned NumNodesExplored = 0; const unsigned MaxSearchNodes = 1024; if (auto *Ldn = dyn_cast(RootNode)) { RootNode = Ldn->getChain().getNode(); + // Bail out if we already analyzed this root node and found nothing. + if (ChainsWithoutMergeableStores.contains(RootNode)) + return nullptr; for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored) { if (I.getOperandNo() == 0 && isa(*I)) { // walk down chain @@ -20674,6 +20687,8 @@ void DAGCombiner::getStoreMergeCandidates( I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored) TryToAddCandidate(I); } + + return RootNode; } // We need to check that merging these stores does not cause a loop in the @@ -21304,9 +21319,8 @@ bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) { return false; SmallVector StoreNodes; - SDNode *RootNode; // Find potential store merge candidates by searching through chain sub-DAG - getStoreMergeCandidates(St, StoreNodes, RootNode); + SDNode *RootNode = getStoreMergeCandidates(St, StoreNodes); // Check if there is anything to merge. 
if (StoreNodes.size() < 2) @@ -21362,6 +21376,11 @@ bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) { llvm_unreachable("Unhandled store source type"); } } + + // Remember if we failed to optimize, to save compile time. + if (!MadeChange) + ChainsWithoutMergeableStores.insert(RootNode); + return MadeChange; } From 3bc38fb27a12f785d8e78b8d00cbd277464ace92 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 16:22:43 +0200 Subject: [PATCH 102/425] [InstCombine] Generalize and consolidate phi translation check (#106051) The foldOpIntoPhi() transforms requires all operands to be phi-translatable. This can be the case either because they are phi nodes in the same block, or because the operand dominates the block. Currently, most callers of foldOpIntoPhi() satisfy this pre-condition by requiring a constant operand, which trivially dominates everything. Only selects had handling for variable operands. Move this logic into foldOpIntoPhi(), so things are handled correctly if other callers are generalized. Also make the implementation a bit more general by querying the dominator tree. --- .../InstCombine/InstCombineSelect.cpp | 42 +------------------ .../InstCombine/InstructionCombining.cpp | 27 ++++++++++-- .../InstCombine/phi-select-constant.ll | 8 ++-- 3 files changed, 30 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index fcd11126073bf..66f7c4592457c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1992,41 +1992,6 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, return Changed ? &SI : nullptr; } -/// SI is a select whose condition is a PHI node (but the two may be in -/// different blocks). See if the true/false values (V) are live in all of the -/// predecessor blocks of the PHI. For example, cases like this can't be mapped: -/// -/// X = phi [ C1, BB1], [C2, BB2] -/// Y = add -/// Z = select X, Y, 0 -/// -/// because Y is not live in BB1/BB2. -static bool canSelectOperandBeMappingIntoPredBlock(const Value *V, - const SelectInst &SI) { - // If the value is a non-instruction value like a constant or argument, it - // can always be mapped. - const Instruction *I = dyn_cast(V); - if (!I) return true; - - // If V is a PHI node defined in the same block as the condition PHI, we can - // map the arguments. - const PHINode *CondPHI = cast(SI.getCondition()); - - if (const PHINode *VP = dyn_cast(I)) - if (VP->getParent() == CondPHI->getParent()) - return true; - - // Otherwise, if the PHI and select are defined in the same block and if V is - // defined in a different block, then we can transform it. - if (SI.getParent() == CondPHI->getParent() && - I->getParent() != CondPHI->getParent()) - return true; - - // Otherwise we have a 'hard' case and we can't tell without doing more - // detailed dominator based analysis, punt. - return false; -} - /// We have an SPF (e.g. a min or max) of an SPF of the form: /// SPF2(SPF1(A, B), C) Instruction *InstCombinerImpl::foldSPFofSPF(Instruction *Inner, @@ -3929,11 +3894,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { // See if we can fold the select into a phi node if the condition is a select. if (auto *PN = dyn_cast(SI.getCondition())) - // The true/false values have to be live in the PHI predecessor's blocks. 
- if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) && - canSelectOperandBeMappingIntoPredBlock(FalseVal, SI)) - if (Instruction *NV = foldOpIntoPhi(SI, PN)) - return NV; + if (Instruction *NV = foldOpIntoPhi(SI, PN)) + return NV; if (SelectInst *TrueSI = dyn_cast(TrueVal)) { if (TrueSI->getCondition()->getType() == CondVal->getType()) { diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index ad2a620081bcd..8195e0539305c 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1730,8 +1730,7 @@ static Value *simplifyInstructionWithPHI(Instruction &I, PHINode *PN, const DataLayout &DL, const SimplifyQuery SQ) { // NB: It is a precondition of this transform that the operands be - // phi translatable! This is usually trivially satisfied by limiting it - // to constant ops, and for selects we do a more sophisticated check. + // phi translatable! SmallVector Ops; for (Value *Op : I.operands()) { if (Op == PN) @@ -1784,9 +1783,31 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { // Otherwise, we can replace *all* users with the new PHI we form. } + // Check that all operands are phi-translatable. + for (Value *Op : I.operands()) { + if (Op == PN) + continue; + + // Non-instructions never require phi-translation. + auto *I = dyn_cast(Op); + if (!I) + continue; + + // Phi-translate can handle phi nodes in the same block. + if (isa(I)) + if (I->getParent() == PN->getParent()) + continue; + + // Operand dominates the block, no phi-translation necessary. + if (DT.dominates(I, PN->getParent())) + continue; + + // Not phi-translatable, bail out. + return nullptr; + } + // Check to see whether the instruction can be folded into each phi operand. // If there is one operand that does not fold, remember the BB it is in. - // If there is more than one or if *it* is a PHI, bail out. 
SmallVector NewPhiValues; BasicBlock *NonSimplifiedBB = nullptr; Value *NonSimplifiedInVal = nullptr; diff --git a/llvm/test/Transforms/InstCombine/phi-select-constant.ll b/llvm/test/Transforms/InstCombine/phi-select-constant.ll index 9d61891127690..81a27811ec63f 100644 --- a/llvm/test/Transforms/InstCombine/phi-select-constant.ll +++ b/llvm/test/Transforms/InstCombine/phi-select-constant.ll @@ -226,17 +226,17 @@ final: define i32 @dominating_values_select_not_same_block(i1 %c1, i1 %c2, ptr %p, ptr %p2) { ; CHECK-LABEL: @dominating_values_select_not_same_block( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P:%.*]], align 4 ; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[P2:%.*]], align 4 ; CHECK-NEXT: br i1 [[C1:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] ; CHECK: delay: +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[C2:%.*]], i32 [[A]], i32 [[B]] ; CHECK-NEXT: br label [[FINAL]] ; CHECK: final: -; CHECK-NEXT: [[USE2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[C2:%.*]], [[DELAY]] ] +; CHECK-NEXT: [[USE2:%.*]] = phi i32 [ [[B]], [[ENTRY:%.*]] ], [ [[TMP0]], [[DELAY]] ] ; CHECK-NEXT: br label [[SPLIT:%.*]] ; CHECK: split: -; CHECK-NEXT: [[VALUE:%.*]] = select i1 [[USE2]], i32 [[A]], i32 [[B]] -; CHECK-NEXT: ret i32 [[VALUE]] +; CHECK-NEXT: ret i32 [[USE2]] ; entry: %a = load i32, ptr %p From 75dc9af1a227e5bfd34eeaf822d2db4520545f14 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 4 Sep 2024 05:57:13 -0700 Subject: [PATCH 103/425] [SLP][NFC]Remove some dead code + reorder calls to avoid extra checks --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a2af7f4e1b01c..60476398e5ca7 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7126,8 +7126,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // Don't handle vectors. - if (!SLPReVec && getValueType(S.OpValue)->isVectorTy() && - !isa(S.OpValue)) { + if (!SLPReVec && getValueType(S.OpValue)->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; @@ -13221,9 +13220,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } Value *V = E->Scalars.front(); - Type *ScalarTy = getValueType(V); - if (isa(V)) - ScalarTy = V->getType(); + Type *ScalarTy = V->getType(); + if (!isa(V)) + ScalarTy = getValueType(V); auto It = MinBWs.find(E); if (It != MinBWs.end()) { auto *VecTy = dyn_cast(ScalarTy); From f7fa75b20835254c35baeff908b8c3827c13db41 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Wed, 4 Sep 2024 15:29:32 +0100 Subject: [PATCH 104/425] [AArch64] Implement intrinsics for SME2 FAMIN/FAMAX (#99063) This patch implements these intrinsics: ``` c // Variants are also available for: // [_f32_x2], [_f64_x2], // [_f16_x4], [_f32_x4], [_f64_x4] svfloat16x2_t svamax[_f16_x2](svfloat16x2 zd, svfloat16x2_t zm) __arm_streaming; svfloat16x2_t svamin[_f16_x2](svfloat16x2 zd, svfloat16x2_t zm) __arm_streaming; ``` (cf. 
https://github.com/ARM-software/acle/pull/324) Co-authored-by: Caroline Concatto --- clang/include/clang/Basic/arm_sve.td | 7 + .../acle_sme2_faminmax.c | 476 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 9 + .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 28 ++ .../AArch64/sme2-intrinsics-faminmax.ll | 241 +++++++++ 5 files changed, 761 insertions(+) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index a0f12e1bbd2d4..edf73d9022b06 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2253,6 +2253,13 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVSQDMULH_X4 : SInst<"svqdmulh[_{d}_x4]", "444", "csil", MergeNone, "aarch64_sve_sqdmulh_vgx4", [IsStreaming], []>; } +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,faminmax" in { + def FAMIN_X2 : Inst<"svamin[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sme_famin_x2", [IsStreaming], []>; + def FAMAX_X2 : Inst<"svamax[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sme_famax_x2", [IsStreaming], []>; + def FAMIN_X4 : Inst<"svamin[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sme_famin_x4", [IsStreaming], []>; + def FAMAX_X4 : Inst<"svamax[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sme_famax_x4", [IsStreaming], []>; +} + let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [VerifyRuntimeMode], []>; def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [VerifyRuntimeMode], []>; diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c new file mode 100644 index 0000000000000..5d026f8cde5e0 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c @@ -0,0 +1,476 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + + +// Multi, x2 + +// CHECK-LABEL: @test_svamax_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f16_x213svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svamax_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f16_x2)(zdn, zm); +} + +// CHECK-LABEL: @test_svamax_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f32_x213svfloat32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svamax_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f32_x2)(zdn, zm); +} + +// CHECK-LABEL: @test_svamax_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f64_x213svfloat64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svamax_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f64_x2)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) 
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f16_x213svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svamin_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f16_x2)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f32_x213svfloat32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svamin_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f32_x2)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f64_x213svfloat64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svamin_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f64_x2)(zdn, zm); +} + +// Multi, x4 + +// CHECK-LABEL: @test_svamax_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] 
= extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f16_x413svfloat16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svamax_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f16_x4)(zdn, zm); +} + +// CHECK-LABEL: @test_svamax_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = 
extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f32_x413svfloat32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svamax_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f32_x4)(zdn, zm); +} + +// CHECK-LABEL: @test_svamax_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } 
[[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f64_x413svfloat64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svamax_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f64_x4)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: 
[[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f16_x413svfloat16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svamin_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f16_x4)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } 
@llvm.aarch64.sme.famin.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f32_x413svfloat32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svamin_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f32_x4)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], 
i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f64_x413svfloat64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svamin_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f64_x4)(zdn, zm); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 9bce850750f79..8ac1d67e162f7 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3521,6 +3521,15 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_ # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic; } + // + // Multi-vector floating point absolute min/max number + // + + foreach instr = ["famax", "famin"] in { + def int_aarch64_sme_ # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic; + def int_aarch64_sme_ # instr # _x4 : 
SME2_VG4_Multi_Multi_Intrinsic; + } + // // Multi-vector vertical dot-products // diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index f5f9a62faa0f5..69806c9c3fdbf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5822,6 +5822,34 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::FMAX_VG4_4Z4Z_S, AArch64::FMAX_VG4_4Z4Z_D})) SelectDestructiveMultiIntrinsic(Node, 4, true, Op); return; + case Intrinsic::aarch64_sme_famax_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FAMAX_2Z2Z_H, AArch64::FAMAX_2Z2Z_S, + AArch64::FAMAX_2Z2Z_D})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op); + return; + case Intrinsic::aarch64_sme_famax_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FAMAX_4Z4Z_H, AArch64::FAMAX_4Z4Z_S, + AArch64::FAMAX_4Z4Z_D})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op); + return; + case Intrinsic::aarch64_sme_famin_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FAMIN_2Z2Z_H, AArch64::FAMIN_2Z2Z_S, + AArch64::FAMIN_2Z2Z_D})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op); + return; + case Intrinsic::aarch64_sme_famin_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FAMIN_4Z4Z_H, AArch64::FAMIN_4Z4Z_S, + AArch64::FAMIN_4Z4Z_D})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op); + return; case Intrinsic::aarch64_sve_smin_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll new file mode 100644 index 0000000000000..ecfbb47ba5571 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll @@ -0,0 +1,241 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -mattr=+sme2 -mattr=+faminmax -force-streaming -verify-machineinstrs < %s | FileCheck %s + +; FAMAX (Multi, x2) + +define { , } @multi_vec_max_multi_x2_f16( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_max_multi_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famax { z4.h, z5.h }, { z4.h, z5.h }, { z6.h, z7.h } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.famax.x2.nxv8f16( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @multi_vec_max_multi_x2_f32( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_max_multi_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famax { z4.s, z5.s }, { z4.s, z5.s }, { z6.s, z7.s } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.famax.x2.nxv4f32( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @multi_vec_max_multi_x2_f64( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_max_multi_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famax { z4.d, z5.d }, { z4.d, z5.d }, { z6.d, z7.d } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, 
z5.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.famax.x2.nxv2f64( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +; FAMAX (Multi, x4) + +define { , , , }@multi_vec_max_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_max_multi_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.famax.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @multi_vec_max_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_max_multi_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.famax.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @multi_vec_max_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_max_multi_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sme.famax.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + + +; FAMIN (Multi, x2) + +define { , } @multi_vec_min_multi_x2_f16( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_min_multi_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famin { z4.h, z5.h }, { z4.h, z5.h }, { z6.h, z7.h } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.famin.x2.nxv8f16( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @multi_vec_min_multi_x2_f32( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_min_multi_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famin { z4.s, z5.s }, { z4.s, z5.s }, { z6.s, z7.s } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: 
ret + %res = call { , } @llvm.aarch64.sme.famin.x2.nxv4f32( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @multi_vec_main_multi_x2_f64( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_main_multi_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famin { z4.d, z5.d }, { z4.d, z5.d }, { z6.d, z7.d } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.famin.x2.nxv2f64( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +; FAMIN (Multi, x4) + +define { , , , } @multi_vec_min_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_min_multi_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sme.famin.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @multi_vec_min_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_min_multi_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sme.famin.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @multi_vec_min_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_min_multi_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sme.famin.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} From 660e34fd38c3fb39fba1871bbf5b2eb3a48bf277 Mon Sep 17 00:00:00 2001 From: Alexey Merzlyakov <60094858+AlexeyMerzlyakov@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:31:59 +0300 Subject: [PATCH 105/425] [lldb][RISCV] Support optionally disabled FPR for riscv64 (#104547) The PR adds the support optionally enabled/disabled FP-registers to LLDB `RegisterInfoPOSIX_riscv64`. 
This situation might take place for RISC-V builds having no FP-registers, like RV64IMAC or RV64IMACV. To aim this, patch adds `opt_regsets` flags mechanism. It re-works RegisterInfo class to work with flexibly allocated (depending on `opt_regsets` flag) `m_register_sets` and `m_register_infos` vectors instead of statically defined structures. The registration of regsets is being arranged by `m_per_regset_regnum_range` map. The patch flows are spread to `NativeRegisterContextLinux_riscv64` and `RegisterContextCorePOSIX_riscv64` classes, that were tested on: - x86_64 host working with coredumps - RV64GC and RV64IMAC targets working with coredumps and natively in run-time with binaries `EmulateInstructionRISCV` is out of scope of this patch, and its behavior did not change, using maximum set of registers. According testcase built for RV64IMAC (no-FPR) was added to `TestLinuxCore.py`. --- .../RISCV/EmulateInstructionRISCV.cpp | 8 +- .../NativeRegisterContextLinux_riscv64.cpp | 57 +++++--- .../NativeRegisterContextLinux_riscv64.h | 2 + .../Utility/RegisterContextPOSIX_riscv64.cpp | 3 +- .../Utility/RegisterInfoPOSIX_riscv64.cpp | 132 +++++++++-------- .../Utility/RegisterInfoPOSIX_riscv64.h | 38 +++-- .../Process/Utility/RegisterInfos_riscv64.h | 54 +++---- .../RegisterContextPOSIXCore_riscv64.cpp | 25 ++-- .../RegisterContextPOSIXCore_riscv64.h | 3 - .../postmortem/elf-core/TestLinuxCore.py | 133 ++++++++++++++---- ...iscv64.core => linux-riscv64.gpr_fpr.core} | Bin 20480 -> 20480 bytes .../elf-core/linux-riscv64.gpr_fpr.out | Bin 0 -> 3480 bytes .../elf-core/linux-riscv64.gpr_only.core | Bin 0 -> 28672 bytes .../elf-core/linux-riscv64.gpr_only.out | Bin 0 -> 3520 bytes .../postmortem/elf-core/linux-riscv64.out | Bin 3328 -> 0 bytes .../postmortem/elf-core/main_fpr.c | 14 ++ 16 files changed, 300 insertions(+), 169 deletions(-) rename lldb/test/API/functionalities/postmortem/elf-core/{linux-riscv64.core => linux-riscv64.gpr_fpr.core} (76%) create mode 100755 lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_fpr.out create mode 100644 lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_only.core create mode 100755 lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_only.out delete mode 100755 lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.out create mode 100644 lldb/test/API/functionalities/postmortem/elf-core/main_fpr.c diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp index e8014b1eeb378..badc7ba36f011 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp @@ -1748,10 +1748,10 @@ EmulateInstructionRISCV::GetRegisterInfo(RegisterKind reg_kind, } } - const RegisterInfo *array = - RegisterInfoPOSIX_riscv64::GetRegisterInfoPtr(m_arch); - const uint32_t length = - RegisterInfoPOSIX_riscv64::GetRegisterInfoCount(m_arch); + RegisterInfoPOSIX_riscv64 reg_info(m_arch, + RegisterInfoPOSIX_riscv64::eRegsetMaskAll); + const RegisterInfo *array = reg_info.GetRegisterInfo(); + const uint32_t length = reg_info.GetRegisterCount(); if (reg_index >= length || reg_kind != eRegisterKindLLDB) return {}; diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.cpp index bfa1a154b0f28..45b6c8ff9905b 100644 --- 
a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.cpp @@ -23,12 +23,11 @@ // System includes - They have to be included after framework includes because // they define some macros which collide with variable names in other modules +#include #include // NT_PRSTATUS and NT_FPREGSET definition #include -#define REG_CONTEXT_SIZE (GetGPRSize() + GetFPRSize()) - using namespace lldb; using namespace lldb_private; using namespace lldb_private::process_linux; @@ -38,7 +37,21 @@ NativeRegisterContextLinux::CreateHostNativeRegisterContextLinux( const ArchSpec &target_arch, NativeThreadLinux &native_thread) { switch (target_arch.GetMachine()) { case llvm::Triple::riscv64: { - Flags opt_regsets; + Flags opt_regsets(RegisterInfoPOSIX_riscv64::eRegsetMaskDefault); + + RegisterInfoPOSIX_riscv64::FPR fpr; + struct iovec ioVec; + ioVec.iov_base = &fpr; + ioVec.iov_len = sizeof(fpr); + unsigned int regset = NT_FPREGSET; + + if (NativeProcessLinux::PtraceWrapper(PTRACE_GETREGSET, + native_thread.GetID(), ®set, + &ioVec, sizeof(fpr)) + .Success()) { + opt_regsets.Set(RegisterInfoPOSIX_riscv64::eRegsetMaskFP); + } + auto register_info_up = std::make_unique(target_arch, opt_regsets); return std::make_unique( @@ -194,20 +207,23 @@ Status NativeRegisterContextLinux_riscv64::ReadAllRegisterValues( lldb::WritableDataBufferSP &data_sp) { Status error; - data_sp.reset(new DataBufferHeap(REG_CONTEXT_SIZE, 0)); + data_sp.reset(new DataBufferHeap(GetRegContextSize(), 0)); error = ReadGPR(); if (error.Fail()) return error; - error = ReadFPR(); - if (error.Fail()) - return error; + if (GetRegisterInfo().IsFPPresent()) { + error = ReadFPR(); + if (error.Fail()) + return error; + } uint8_t *dst = const_cast(data_sp->GetBytes()); ::memcpy(dst, GetGPRBuffer(), GetGPRSize()); dst += GetGPRSize(); - ::memcpy(dst, GetFPRBuffer(), GetFPRSize()); + if (GetRegisterInfo().IsFPPresent()) + ::memcpy(dst, GetFPRBuffer(), GetFPRSize()); return error; } @@ -223,11 +239,11 @@ Status NativeRegisterContextLinux_riscv64::WriteAllRegisterValues( return error; } - if (data_sp->GetByteSize() != REG_CONTEXT_SIZE) { + if (data_sp->GetByteSize() != GetRegContextSize()) { error = Status::FromErrorStringWithFormat( "NativeRegisterContextLinux_riscv64::%s data_sp contained mismatched " "data size, expected %" PRIu64 ", actual %" PRIu64, - __FUNCTION__, REG_CONTEXT_SIZE, data_sp->GetByteSize()); + __FUNCTION__, GetRegContextSize(), data_sp->GetByteSize()); return error; } @@ -247,23 +263,32 @@ Status NativeRegisterContextLinux_riscv64::WriteAllRegisterValues( return error; src += GetRegisterInfoInterface().GetGPRSize(); - ::memcpy(GetFPRBuffer(), src, GetFPRSize()); - error = WriteFPR(); - if (error.Fail()) - return error; + if (GetRegisterInfo().IsFPPresent()) { + ::memcpy(GetFPRBuffer(), src, GetFPRSize()); + + error = WriteFPR(); + if (error.Fail()) + return error; + } return error; } +size_t NativeRegisterContextLinux_riscv64::GetRegContextSize() { + size_t size = GetGPRSize(); + if (GetRegisterInfo().IsFPPresent()) + size += GetFPRSize(); + return size; +} + bool NativeRegisterContextLinux_riscv64::IsGPR(unsigned reg) const { return GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg) == RegisterInfoPOSIX_riscv64::GPRegSet; } bool NativeRegisterContextLinux_riscv64::IsFPR(unsigned reg) const { - return GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg) == - RegisterInfoPOSIX_riscv64::FPRegSet; + return GetRegisterInfo().IsFPReg(reg); } Status 
NativeRegisterContextLinux_riscv64::ReadGPR() { diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.h b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.h index 41b4e2573add9..d5cc50131cdc3 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.h +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.h @@ -75,6 +75,8 @@ class NativeRegisterContextLinux_riscv64 : public NativeRegisterContextLinux { RegisterInfoPOSIX_riscv64::FPR m_fpr; + size_t GetRegContextSize(); + bool IsGPR(unsigned reg) const; bool IsFPR(unsigned reg) const; diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_riscv64.cpp b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_riscv64.cpp index 035ce00e11626..bbcfb9eae1003 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_riscv64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_riscv64.cpp @@ -77,6 +77,5 @@ bool RegisterContextPOSIX_riscv64::IsGPR(unsigned int reg) { } bool RegisterContextPOSIX_riscv64::IsFPR(unsigned int reg) { - return m_register_info_up->GetRegisterSetFromRegisterIndex(reg) == - RegisterInfoPOSIX_riscv64::FPRegSet; + return m_register_info_up->IsFPReg(reg); } diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.cpp b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.cpp index 3819401c543b1..4a3737795848e 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.cpp @@ -18,42 +18,15 @@ #define GPR_OFFSET(idx) ((idx)*8 + 0) #define FPR_OFFSET(idx) ((idx)*8 + sizeof(RegisterInfoPOSIX_riscv64::GPR)) -#define REG_CONTEXT_SIZE \ - (sizeof(RegisterInfoPOSIX_riscv64::GPR) + \ - sizeof(RegisterInfoPOSIX_riscv64::FPR)) - #define DECLARE_REGISTER_INFOS_RISCV64_STRUCT #include "RegisterInfos_riscv64.h" #undef DECLARE_REGISTER_INFOS_RISCV64_STRUCT -const lldb_private::RegisterInfo *RegisterInfoPOSIX_riscv64::GetRegisterInfoPtr( - const lldb_private::ArchSpec &target_arch) { - switch (target_arch.GetMachine()) { - case llvm::Triple::riscv64: - return g_register_infos_riscv64_le; - default: - assert(false && "Unhandled target architecture."); - return nullptr; - } -} - -uint32_t RegisterInfoPOSIX_riscv64::GetRegisterInfoCount( - const lldb_private::ArchSpec &target_arch) { - switch (target_arch.GetMachine()) { - case llvm::Triple::riscv64: - return static_cast(sizeof(g_register_infos_riscv64_le) / - sizeof(g_register_infos_riscv64_le[0])); - default: - assert(false && "Unhandled target architecture."); - return 0; - } -} - // Number of register sets provided by this context. enum { k_num_gpr_registers = gpr_last_riscv - gpr_first_riscv + 1, k_num_fpr_registers = fpr_last_riscv - fpr_first_riscv + 1, - k_num_register_sets = 2 + k_num_register_sets_default = 1 }; // RISC-V64 general purpose registers. @@ -73,38 +46,69 @@ static_assert(((sizeof g_gpr_regnums_riscv64 / 1) == k_num_gpr_registers, "g_gpr_regnums_riscv64 has wrong number of register infos"); -// RISC-V64 floating point registers. 
-static const uint32_t g_fpr_regnums_riscv64[] = { - fpr_f0_riscv, fpr_f1_riscv, fpr_f2_riscv, fpr_f3_riscv, - fpr_f4_riscv, fpr_f5_riscv, fpr_f6_riscv, fpr_f7_riscv, - fpr_f8_riscv, fpr_f9_riscv, fpr_f10_riscv, fpr_f11_riscv, - fpr_f12_riscv, fpr_f13_riscv, fpr_f14_riscv, fpr_f15_riscv, - fpr_f16_riscv, fpr_f17_riscv, fpr_f18_riscv, fpr_f19_riscv, - fpr_f20_riscv, fpr_f21_riscv, fpr_f22_riscv, fpr_f23_riscv, - fpr_f24_riscv, fpr_f25_riscv, fpr_f26_riscv, fpr_f27_riscv, - fpr_f28_riscv, fpr_f29_riscv, fpr_f30_riscv, fpr_f31_riscv, - fpr_fcsr_riscv, LLDB_INVALID_REGNUM}; - -static_assert(((sizeof g_fpr_regnums_riscv64 / - sizeof g_fpr_regnums_riscv64[0]) - - 1) == k_num_fpr_registers, - "g_fpr_regnums_riscv64 has wrong number of register infos"); - // Register sets for RISC-V64. -static const lldb_private::RegisterSet g_reg_sets_riscv64[k_num_register_sets] = - {{"General Purpose Registers", "gpr", k_num_gpr_registers, - g_gpr_regnums_riscv64}, - {"Floating Point Registers", "fpr", k_num_fpr_registers, - g_fpr_regnums_riscv64}}; +static const lldb_private::RegisterSet g_reg_set_gpr_riscv64 = { + "General Purpose Registers", "gpr", k_num_gpr_registers, + g_gpr_regnums_riscv64}; +static const lldb_private::RegisterSet g_reg_set_fpr_riscv64 = { + "Floating Point Registers", "fpr", k_num_fpr_registers, nullptr}; RegisterInfoPOSIX_riscv64::RegisterInfoPOSIX_riscv64( - const lldb_private::ArchSpec &target_arch, lldb_private::Flags flags) + const lldb_private::ArchSpec &target_arch, lldb_private::Flags opt_regsets) : lldb_private::RegisterInfoAndSetInterface(target_arch), - m_register_info_p(GetRegisterInfoPtr(target_arch)), - m_register_info_count(GetRegisterInfoCount(target_arch)) {} + m_opt_regsets(opt_regsets) { + switch (target_arch.GetMachine()) { + case llvm::Triple::riscv64: { + // By-default considering RISC-V has only GPR. + // Other register sets could be enabled optionally by opt_regsets. + AddRegSetGP(); + + if (m_opt_regsets.AnySet(eRegsetMaskFP)) + AddRegSetFP(); + + break; + } + default: + assert(false && "Unhandled target architecture."); + } +} + +void RegisterInfoPOSIX_riscv64::AddRegSetGP() { + m_register_infos.resize(k_num_gpr_registers); + memcpy(&m_register_infos[0], g_register_infos_riscv64_gpr, + sizeof(g_register_infos_riscv64_gpr)); + m_register_sets.push_back(g_reg_set_gpr_riscv64); + + m_per_regset_regnum_range[GPRegSet] = + std::make_pair(gpr_first_riscv, m_register_infos.size()); +} + +void RegisterInfoPOSIX_riscv64::AddRegSetFP() { + const uint32_t register_info_count = m_register_infos.size(); + const uint32_t register_set_count = m_register_sets.size(); + + // Filling m_register_infos. + // For FPR case we do not need to correct register offsets and kinds + // while for other further cases (like VPR), register offset/kind + // should be started counting from the last one in previously added + // regset. This is needed for the case e.g. when architecture has GPR + VPR + // sets only. 
+ m_register_infos.resize(register_info_count + k_num_fpr_registers); + memcpy(&m_register_infos[register_info_count], g_register_infos_riscv64_fpr, + sizeof(g_register_infos_riscv64_fpr)); + + // Filling m_register_sets with enabled register set + for (uint32_t i = 0; i < k_num_fpr_registers; i++) + m_fp_regnum_collection.push_back(register_info_count + i); + m_register_sets.push_back(g_reg_set_fpr_riscv64); + m_register_sets.back().registers = m_fp_regnum_collection.data(); + + m_per_regset_regnum_range[register_set_count] = + std::make_pair(register_info_count, m_register_infos.size()); +} uint32_t RegisterInfoPOSIX_riscv64::GetRegisterCount() const { - return m_register_info_count; + return m_register_infos.size(); } size_t RegisterInfoPOSIX_riscv64::GetGPRSize() const { @@ -117,26 +121,30 @@ size_t RegisterInfoPOSIX_riscv64::GetFPRSize() const { const lldb_private::RegisterInfo * RegisterInfoPOSIX_riscv64::GetRegisterInfo() const { - return m_register_info_p; + return m_register_infos.data(); } size_t RegisterInfoPOSIX_riscv64::GetRegisterSetCount() const { - return k_num_register_sets; + return m_register_sets.size(); } size_t RegisterInfoPOSIX_riscv64::GetRegisterSetFromRegisterIndex( uint32_t reg_index) const { - // coverity[unsigned_compare] - if (reg_index >= gpr_first_riscv && reg_index <= gpr_last_riscv) - return GPRegSet; - if (reg_index >= fpr_first_riscv && reg_index <= fpr_last_riscv) - return FPRegSet; + for (const auto ®set_range : m_per_regset_regnum_range) { + if (reg_index >= regset_range.second.first && + reg_index < regset_range.second.second) + return regset_range.first; + } return LLDB_INVALID_REGNUM; } +bool RegisterInfoPOSIX_riscv64::IsFPReg(unsigned reg) const { + return llvm::is_contained(m_fp_regnum_collection, reg); +} + const lldb_private::RegisterSet * RegisterInfoPOSIX_riscv64::GetRegisterSet(size_t set_index) const { if (set_index < GetRegisterSetCount()) - return &g_reg_sets_riscv64[set_index]; + return &m_register_sets[set_index]; return nullptr; } diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.h b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.h index 4bf4bede01328..f8e22c7df3c88 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.h @@ -11,19 +11,21 @@ #include "RegisterInfoAndSetInterface.h" #include "lldb/Target/RegisterContext.h" +#include "lldb/Utility/Flags.h" #include "lldb/lldb-private.h" #include class RegisterInfoPOSIX_riscv64 : public lldb_private::RegisterInfoAndSetInterface { public: - static const lldb_private::RegisterInfo * - GetRegisterInfoPtr(const lldb_private::ArchSpec &target_arch); - static uint32_t - GetRegisterInfoCount(const lldb_private::ArchSpec &target_arch); + enum { GPRegSet = 0 }; -public: - enum { GPRegSet = 0, FPRegSet }; + // RISC-V64 register set mask value + enum { + eRegsetMaskDefault = 0, + eRegsetMaskFP = 1, + eRegsetMaskAll = -1, + }; struct GPR { // note: gpr[0] is pc, not x0 @@ -41,7 +43,11 @@ class RegisterInfoPOSIX_riscv64 }; RegisterInfoPOSIX_riscv64(const lldb_private::ArchSpec &target_arch, - lldb_private::Flags flags); + lldb_private::Flags opt_regsets); + + void AddRegSetGP(); + + void AddRegSetFP(); size_t GetGPRSize() const override; @@ -58,9 +64,23 @@ class RegisterInfoPOSIX_riscv64 size_t GetRegisterSetFromRegisterIndex(uint32_t reg_index) const override; + bool IsFPPresent() const { return m_opt_regsets.AnySet(eRegsetMaskFP); } + + bool IsFPReg(unsigned reg) const; + 
private: - const lldb_private::RegisterInfo *m_register_info_p; - uint32_t m_register_info_count; + std::vector m_register_infos; + + std::vector m_register_sets; + + // Contains pair of [start, end] register numbers of a register set with start + // and end included. + std::map> m_per_regset_regnum_range; + + // Register collections to be stored as reference for m_register_sets items + std::vector m_fp_regnum_collection; + + lldb_private::Flags m_opt_regsets; }; #endif diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfos_riscv64.h b/lldb/source/Plugins/Process/Utility/RegisterInfos_riscv64.h index 720d900c7b97e..628ed3770bdc0 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfos_riscv64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfos_riscv64.h @@ -79,7 +79,7 @@ using namespace riscv_dwarf; // clang-format on -static lldb_private::RegisterInfo g_register_infos_riscv64_le[] = { +static lldb_private::RegisterInfo g_register_infos_riscv64_gpr[] = { // DEFINE_GPR64(name, GENERIC KIND) DEFINE_GPR64(pc, LLDB_REGNUM_GENERIC_PC), DEFINE_GPR64_ALT(ra, x1, LLDB_REGNUM_GENERIC_RA), @@ -114,7 +114,9 @@ static lldb_private::RegisterInfo g_register_infos_riscv64_le[] = { DEFINE_GPR64_ALT(t5, x30, LLDB_INVALID_REGNUM), DEFINE_GPR64_ALT(t6, x31, LLDB_INVALID_REGNUM), DEFINE_GPR64_ALT(zero, x0, LLDB_INVALID_REGNUM), +}; +static lldb_private::RegisterInfo g_register_infos_riscv64_fpr[] = { DEFINE_FPR64_ALT(ft0, f0, LLDB_INVALID_REGNUM), DEFINE_FPR64_ALT(ft1, f1, LLDB_INVALID_REGNUM), DEFINE_FPR64_ALT(ft2, f2, LLDB_INVALID_REGNUM), @@ -148,39 +150,25 @@ static lldb_private::RegisterInfo g_register_infos_riscv64_le[] = { DEFINE_FPR64_ALT(ft10, f30, LLDB_INVALID_REGNUM), DEFINE_FPR64_ALT(ft11, f31, LLDB_INVALID_REGNUM), DEFINE_FPR_ALT(fcsr, nullptr, 4, LLDB_INVALID_REGNUM), +}; - DEFINE_VPR(v0, LLDB_INVALID_REGNUM), - DEFINE_VPR(v1, LLDB_INVALID_REGNUM), - DEFINE_VPR(v2, LLDB_INVALID_REGNUM), - DEFINE_VPR(v3, LLDB_INVALID_REGNUM), - DEFINE_VPR(v4, LLDB_INVALID_REGNUM), - DEFINE_VPR(v5, LLDB_INVALID_REGNUM), - DEFINE_VPR(v6, LLDB_INVALID_REGNUM), - DEFINE_VPR(v7, LLDB_INVALID_REGNUM), - DEFINE_VPR(v8, LLDB_INVALID_REGNUM), - DEFINE_VPR(v9, LLDB_INVALID_REGNUM), - DEFINE_VPR(v10, LLDB_INVALID_REGNUM), - DEFINE_VPR(v11, LLDB_INVALID_REGNUM), - DEFINE_VPR(v12, LLDB_INVALID_REGNUM), - DEFINE_VPR(v13, LLDB_INVALID_REGNUM), - DEFINE_VPR(v14, LLDB_INVALID_REGNUM), - DEFINE_VPR(v15, LLDB_INVALID_REGNUM), - DEFINE_VPR(v16, LLDB_INVALID_REGNUM), - DEFINE_VPR(v17, LLDB_INVALID_REGNUM), - DEFINE_VPR(v18, LLDB_INVALID_REGNUM), - DEFINE_VPR(v19, LLDB_INVALID_REGNUM), - DEFINE_VPR(v20, LLDB_INVALID_REGNUM), - DEFINE_VPR(v21, LLDB_INVALID_REGNUM), - DEFINE_VPR(v22, LLDB_INVALID_REGNUM), - DEFINE_VPR(v23, LLDB_INVALID_REGNUM), - DEFINE_VPR(v24, LLDB_INVALID_REGNUM), - DEFINE_VPR(v25, LLDB_INVALID_REGNUM), - DEFINE_VPR(v26, LLDB_INVALID_REGNUM), - DEFINE_VPR(v27, LLDB_INVALID_REGNUM), - DEFINE_VPR(v28, LLDB_INVALID_REGNUM), - DEFINE_VPR(v29, LLDB_INVALID_REGNUM), - DEFINE_VPR(v30, LLDB_INVALID_REGNUM), - DEFINE_VPR(v31, LLDB_INVALID_REGNUM), +static lldb_private::RegisterInfo g_register_infos_riscv64_vpr[] = { + DEFINE_VPR(v0, LLDB_INVALID_REGNUM), DEFINE_VPR(v1, LLDB_INVALID_REGNUM), + DEFINE_VPR(v2, LLDB_INVALID_REGNUM), DEFINE_VPR(v3, LLDB_INVALID_REGNUM), + DEFINE_VPR(v4, LLDB_INVALID_REGNUM), DEFINE_VPR(v5, LLDB_INVALID_REGNUM), + DEFINE_VPR(v6, LLDB_INVALID_REGNUM), DEFINE_VPR(v7, LLDB_INVALID_REGNUM), + DEFINE_VPR(v8, LLDB_INVALID_REGNUM), DEFINE_VPR(v9, LLDB_INVALID_REGNUM), + DEFINE_VPR(v10, 
LLDB_INVALID_REGNUM), DEFINE_VPR(v11, LLDB_INVALID_REGNUM), + DEFINE_VPR(v12, LLDB_INVALID_REGNUM), DEFINE_VPR(v13, LLDB_INVALID_REGNUM), + DEFINE_VPR(v14, LLDB_INVALID_REGNUM), DEFINE_VPR(v15, LLDB_INVALID_REGNUM), + DEFINE_VPR(v16, LLDB_INVALID_REGNUM), DEFINE_VPR(v17, LLDB_INVALID_REGNUM), + DEFINE_VPR(v18, LLDB_INVALID_REGNUM), DEFINE_VPR(v19, LLDB_INVALID_REGNUM), + DEFINE_VPR(v20, LLDB_INVALID_REGNUM), DEFINE_VPR(v21, LLDB_INVALID_REGNUM), + DEFINE_VPR(v22, LLDB_INVALID_REGNUM), DEFINE_VPR(v23, LLDB_INVALID_REGNUM), + DEFINE_VPR(v24, LLDB_INVALID_REGNUM), DEFINE_VPR(v25, LLDB_INVALID_REGNUM), + DEFINE_VPR(v26, LLDB_INVALID_REGNUM), DEFINE_VPR(v27, LLDB_INVALID_REGNUM), + DEFINE_VPR(v28, LLDB_INVALID_REGNUM), DEFINE_VPR(v29, LLDB_INVALID_REGNUM), + DEFINE_VPR(v30, LLDB_INVALID_REGNUM), DEFINE_VPR(v31, LLDB_INVALID_REGNUM), }; #endif // DECLARE_REGISTER_INFOS_RISCV64_STRUCT diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.cpp b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.cpp index 5ba18cdb9889a..3dca4b1609905 100644 --- a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.cpp +++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.cpp @@ -16,9 +16,17 @@ std::unique_ptr RegisterContextCorePOSIX_riscv64::Create(Thread &thread, const ArchSpec &arch, const DataExtractor &gpregset, llvm::ArrayRef notes) { + Flags opt_regsets = RegisterInfoPOSIX_riscv64::eRegsetMaskDefault; + + DataExtractor fpregset = getRegset(notes, arch.GetTriple(), FPR_Desc); + if (fpregset.GetByteSize() >= sizeof(uint64_t)) { + opt_regsets.Set(RegisterInfoPOSIX_riscv64::eRegsetMaskFP); + } + return std::unique_ptr( new RegisterContextCorePOSIX_riscv64( - thread, std::make_unique(arch, Flags()), + thread, + std::make_unique(arch, opt_regsets), gpregset, notes)); } @@ -27,17 +35,14 @@ RegisterContextCorePOSIX_riscv64::RegisterContextCorePOSIX_riscv64( const DataExtractor &gpregset, llvm::ArrayRef notes) : RegisterContextPOSIX_riscv64(thread, std::move(register_info)) { - m_gpr_buffer = std::make_shared(gpregset.GetDataStart(), - gpregset.GetByteSize()); - m_gpr.SetData(m_gpr_buffer); + m_gpr.SetData(std::make_shared(gpregset.GetDataStart(), + gpregset.GetByteSize())); m_gpr.SetByteOrder(gpregset.GetByteOrder()); - ArchSpec arch = m_register_info_up->GetTargetArchitecture(); - DataExtractor fpregset = getRegset(notes, arch.GetTriple(), FPR_Desc); - m_fpr_buffer = std::make_shared(fpregset.GetDataStart(), - fpregset.GetByteSize()); - m_fpr.SetData(m_fpr_buffer); - m_fpr.SetByteOrder(fpregset.GetByteOrder()); + if (m_register_info_up->IsFPPresent()) { + ArchSpec arch = m_register_info_up->GetTargetArchitecture(); + m_fpr = getRegset(notes, arch.GetTriple(), FPR_Desc); + } } RegisterContextCorePOSIX_riscv64::~RegisterContextCorePOSIX_riscv64() = default; diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.h b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.h index 3cf9531df2c1d..a9a5984463574 100644 --- a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.h +++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.h @@ -50,9 +50,6 @@ class RegisterContextCorePOSIX_riscv64 : public RegisterContextPOSIX_riscv64 { bool WriteFPR() override; private: - lldb::DataBufferSP m_gpr_buffer; - lldb::DataBufferSP m_fpr_buffer; - lldb_private::DataExtractor m_gpr; lldb_private::DataExtractor m_fpr; }; diff --git 
a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py index 0b9d17bc9f45e..7e8531c88bf34 100644 --- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py +++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py @@ -21,7 +21,8 @@ class LinuxCoreTestCase(TestBase): _x86_64_pid = 32259 _s390x_pid = 1045 _ppc64le_pid = 28147 - _riscv64_pid = 89328 + _riscv64_gpr_fpr_pid = 1089 + _riscv64_gpr_only_pid = 97 _aarch64_regions = 4 _i386_regions = 4 @@ -61,9 +62,25 @@ def test_s390x(self): self.do_test("linux-s390x", self._s390x_pid, self._s390x_regions, "a.out") @skipIfLLVMTargetMissing("RISCV") - def test_riscv64(self): + def test_riscv64_gpr_fpr(self): """Test that lldb can read the process information from an riscv64 linux core file.""" - self.do_test("linux-riscv64", self._riscv64_pid, self._riscv64_regions, "a.out") + self.do_test( + "linux-riscv64.gpr_fpr", + self._riscv64_gpr_fpr_pid, + self._riscv64_regions, + "a.out", + ) + + @skipIfLLVMTargetMissing("RISCV") + def test_riscv64_gpr_only(self): + """Test that lldb can read the process information from an riscv64 linux core file + made for a RV64IMAC target, having no FP-registers.""" + self.do_test( + "linux-riscv64.gpr_only", + self._riscv64_gpr_only_pid, + self._riscv64_regions, + "a.out", + ) @skipIfLLVMTargetMissing("X86") def test_same_pid_running(self): @@ -668,46 +685,47 @@ def test_arm_core(self): self.expect("register read --all") @skipIfLLVMTargetMissing("RISCV") - def test_riscv64_regs(self): + def test_riscv64_regs_gpr_fpr(self): # check basic registers using 64 bit RISC-V core file target = self.dbg.CreateTarget(None) self.assertTrue(target, VALID_TARGET) - process = target.LoadCore("linux-riscv64.core") + process = target.LoadCore("linux-riscv64.gpr_fpr.core") values = {} - values["pc"] = "0x000000000001015e" - values["ra"] = "0x000000000001018c" - values["sp"] = "0x0000003fffd132a0" - values["gp"] = "0x0000002ae919af50" - values["tp"] = "0x0000003fdceae3e0" - values["t0"] = "0x0" - values["t1"] = "0x0000002ae9187b1c" - values["t2"] = "0x0000000000000021" - values["fp"] = "0x0000003fffd132d0" - values["s1"] = "0x0000002ae919cd98" + values["pc"] = "0x000000000001016e" + values["ra"] = "0x00000000000101a4" + values["sp"] = "0x0000003fffc1d2d0" + values["gp"] = "0x0000002ae6eccf50" + values["tp"] = "0x0000003ff3cb5400" + values["t0"] = "0x7f7f7f7fffffffff" + values["t1"] = "0x0000002ae6eb9b1c" + values["t2"] = "0xffffffffffffffff" + values["fp"] = "0x0000003fffc1d300" + values["s1"] = "0x0000002ae6eced98" values["a0"] = "0x0" values["a1"] = "0x0000000000010144" - values["a2"] = "0x0000002ae919cdb0" - values["a3"] = "0x000000000000002f" - values["a4"] = "0x000000000000002f" + values["a2"] = "0x0000002ae6ecedb0" + values["a3"] = "0xafdbdbff81cf7f81" + values["a4"] = "0x00000000000101e4" values["a5"] = "0x0" - values["a6"] = "0x7efefefefefefeff" + values["a6"] = "0x2f5b5a40014e0001" values["a7"] = "0x00000000000000dd" - values["s2"] = "0x0000002ae9196860" - values["s3"] = "0x0000002ae919cdb0" - values["s4"] = "0x0000003fffc63be8" - values["s5"] = "0x0000002ae919cb78" - values["s6"] = "0x0000002ae9196860" - values["s7"] = "0x0000002ae9196860" + values["s2"] = "0x0000002ae6ec8860" + values["s3"] = "0x0000002ae6ecedb0" + values["s4"] = "0x0000003fff886c18" + values["s5"] = "0x0000002ae6eceb78" + values["s6"] = "0x0000002ae6ec8860" + values["s7"] = "0x0000002ae6ec8860" values["s8"] = "0x0" values["s9"] = 
"0x000000000000000f" - values["s10"] = "0x0000002ae919a8d0" + values["s10"] = "0x0000002ae6ecc8d0" values["s11"] = "0x0000000000000008" - values["t3"] = "0x0000003fdce07df4" + values["t3"] = "0x0000003ff3be3728" values["t4"] = "0x0" - values["t5"] = "0x0000000000000020" - values["t6"] = "0x0000002ae919f1b0" + values["t5"] = "0x0000000000000002" + values["t6"] = "0x0000002ae6ed08b9" values["zero"] = "0x0" + values["fa5"] = "0xffffffff423c0000" values["fcsr"] = "0x00000000" fpr_names = { @@ -728,7 +746,7 @@ def test_riscv64_regs(self): "fa2", "fa3", "fa4", - "fa5", + # fa5 is non-zero and checked in the list above. "fa6", "fa7", "fs0", @@ -760,6 +778,61 @@ def test_riscv64_regs(self): self.expect("register read --all") + @skipIfLLVMTargetMissing("RISCV") + def test_riscv64_regs_gpr_only(self): + # check registers using 64 bit RISC-V core file containing GP-registers only + target = self.dbg.CreateTarget(None) + self.assertTrue(target, VALID_TARGET) + process = target.LoadCore("linux-riscv64.gpr_only.core") + + values = {} + values["pc"] = "0x0000000000010164" + values["ra"] = "0x0000000000010194" + values["sp"] = "0x00fffffff4d5fcc0" + values["gp"] = "0x0000000000157678" + values["tp"] = "0x00ffffff99c43400" + values["t0"] = "0x00ffffff99c6b260" + values["t1"] = "0x00ffffff99b7bd54" + values["t2"] = "0x0000000003f0b27f" + values["fp"] = "0x00fffffff4d5fcf0" + values["s1"] = "0x0000000000000003" + values["a0"] = "0x0" + values["a1"] = "0x0000000000010144" + values["a2"] = "0x0000000000176460" + values["a3"] = "0x000000000015ee38" + values["a4"] = "0x00000000423c0000" + values["a5"] = "0x0" + values["a6"] = "0x0" + values["a7"] = "0x00000000000000dd" + values["s2"] = "0x0" + values["s3"] = "0x000000000014ddf8" + values["s4"] = "0x000000000003651c" + values["s5"] = "0x00fffffffccd8d28" + values["s6"] = "0x000000000014ddf8" + values["s7"] = "0x00ffffff99c69d48" + values["s8"] = "0x00ffffff99c6a008" + values["s9"] = "0x0" + values["s10"] = "0x0" + values["s11"] = "0x0" + values["t3"] = "0x00ffffff99c42000" + values["t4"] = "0x00ffffff99af8e20" + values["t5"] = "0x0000000000000005" + values["t6"] = "0x44760bdd8d5f6381" + values["zero"] = "0x0" + + for regname, value in values.items(): + self.expect( + "register read {}".format(regname), + substrs=["{} = {}".format(regname, value)], + ) + + # Check that LLDB does not try to read other registers from core file + self.expect( + "register read --all", + matching=False, + substrs=["registers were unavailable"], + ) + def test_get_core_file_api(self): """ Test SBProcess::GetCoreFile() API can successfully get the core file. 
diff --git a/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.core b/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_fpr.core similarity index 76% rename from lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.core rename to lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_fpr.core index 0b159fcab931dca85bd5d09c282c2d6c217e2b83..7187b4000c56dc8ca74aec484310746a0fdd1e9c 100644 GIT binary patch delta 1412 zcmchV?@N# zpsaD<1VTuJy@*U0K|u_KA%%i7O$CbzMDQ0G_NFM;b9c`L?GNa{<#W&Xo^$Sf&U1o; zTyT(k(8~P?`&#+9VJ`f;IJ~aqzmQ$2YIH17h47dn3)geCff^-3lL~|w5!hfl+fi2M zal3@+_%0W@#;M=I(YPnT_v%ZErv)5I3>6S4(}6%4E;4~Rev&CrC08<0Sf zWV1uT5zzYrQ+EByrWJTTGuoXbsUiEHAe`lzheL^h&3So zmKM;!LGjUO@(Wm;+`Oc{)@&@}1%;5a!Xrc$Ybh{N$Y*loci8Ko!5F0PN?pu zRN6?NaYjlgQlMVa;09jvSf{Icm7gXz51}`!b))2)ND#5j-e*HOvcETP;Uc-FB`SFj zKGmXA&>-1Ytp3cwHN0qIG0{e8KZ#{A%GoW`PJc&^k>#+uB#s!Hx83i;+xo_IRyH@d z{my?oJK{5(HQ*6m)W&Qy7a?J03{TjKQYUQG9*Q&SGqLnez{uOD*R++dHqK zX1mMlad-CS2rg{{%IrmQfjzeQb90KApWqx=GCD`ou1Q%su z>L24cdC>zLFpb7bC8hdd-pn?M6O%v2iO)>ikt^#u5Jn*W=4%?y;=&>n9D)}Hc*Dp7 z<14ategeVYKnt0Qs|1D$_Kb@rZ-)ltJ@DqD6=@ya$KA=TXqAkqk?RbKrD?yEgY^i4*aIWLx%$V0~lYlp)v@DmvbBI%o*!Th6xOU&;M=PfA~GL&{X@oy^SAqk~H- z^5OapF52!Z5Q>A1JsphZNcx2gUx@4xL5g?r;e2#!fgVKiLw7qfqzN$^SWGnDZy~WP z=GDGG+SUH596ifnbx9mCIz$L1o(r$=9(6Zt&CU0iTpty&fX=T=tEXwwuXz;z-n&_a zLcE#w<`L$~H->$!FgJ5iI&f@<&(K0v;2Zc0!0ZxQW-D+v&;hIko&lFgi$P7rg(Ev%&s&@dF2ZiJ;Re(oo}tZk2UYFn6K&S(4tze-D9KkjJ)0oUwNg=UR7f=3MG1@ zVAUH<(qrX|%~p$mO%8WHgt{xm>O-Q<(9qHxG*As+cGPhKo^tF|K~O_mTPyzQD8vVx zCJj0)I(=Td*Xwdu`COvSY_#Ir&NG}4TVK(hqNXYT{!+Nt&1=Y}{OXgGnDH~Wu*ojE QYTO=Mdt)Y^o* z6l*2o2V_yNls7lehS&=(LhMzQhNTHlW+3W|5ftl+_pBL2aXfQ4<3f= zw_X_>y!6}UBNtEI{p*KsjGyN_3bV(0Z$szi-5=dO9|1UbrayM(lL+*0z5Va09jBKz zBGwVh+cJj5^V@CfIjAmOz$DGTm-oXT@=T)V?YZ!CWIH-leZ&gR3|1`YN5EWmEeKVkkhE5;?y9RhZO zu}v)!B$3sLd3bqwc{@tTwx+W4Qg&Swia)NCQrT8;=W{ojo<`0P=Zw5A&4-9pxRUkr z@@PMTyp+dF)_A(hOH#b?{rh(%9~v=qQ!6LagGxq84P*vw`1T<)Q%S3(OeLjiq!luijGCt+N7$U1tJXt*$9?O)5UZp1&?AF zZ*N7ZdoaQ`(7SGJb#Wb`Fffx`hmVQ)B2hYB_3Tl}pxTzN$Z+;K3>Fi?WKyDJh!Gd$&#-9L zH5(Xf5&d+R7IC)VhGilw26FwWSHOlqj_&GVkH>fR#13>wo!C|w)gdx+cy@t){` z4qT9SwxzWrE_ZM0ZfENwiH$f}3VX5(9pV@w-rq&=-p&(U4B1V65RX3*mvQ|g3d439 zaH0>eewQ2z1GUh+nmE^mqqu-M3{iN5M#R`Ztt}+zR=|u(H={L@Shv*XEjQNCgF#FZ z6Vefm$t>0k49f(tFv$&AlVee=k&RH-q{sr1ftCOt@YJ2$n*-Pt9rkk5Hr7)x*eCI=>=P_#4$bjvD% zT6Hwr(+wL+npsv06CPwzXuusj7jvqLvs#woXOsJy=M_gU6 zR{3dA)q0wd*BlL0ql_m(sm_)hO-Je2o}3a|*>VhJvOKNm(`GS0VCF$_j2Qs3t7g)5eA+~;+%_rQ(vG7+;MeD!5ZyAaZ<&7Fv7QG@VON2h#>Ti z5mn&g-nh#Bz4Id2-k)xzhCh~ozqnr(x&OSMjC$~f7~}rp0Ty)epWG?{E_qnD&jdY- znf|uiTodA-OK-P734eiUs01;x-Q*Jz>{pOW4F@su@1c(B9l|k+av;;9CC-_2t zTJ!XjAoneVgBUTt=Kc$wTdfRo-$4su!EqkuPXc~5>XPsm@*-Lr`CsAw_%EqT!p|?o v-t!Z$a{r^;h+h$C$1L`bh)XXEpK8B540Gb=b?EB3qL_{Rf93v{@8SO+vIc^W literal 0 HcmV?d00001 diff --git a/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_only.core b/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_only.core new file mode 100644 index 0000000000000000000000000000000000000000..63265a5db123d9ebc699a927d7477a264b558e91 GIT binary patch literal 28672 zcmeHQeQ*=U72lJvBuoK=3Ni&!5dwDjkR?OF0i5^)AxI3kHeb_*2>B#iDz@a2WP>xb z7}VJAOeheg*r84#Wd<{8n^4>t(@u#YA;7d@l1|$|r;tJ&aN0C{G{uG#P~W@TCyQ&C zGG*H4k6k#uk9}|V?Qh>6d(7V4MrX+aog^_TgV-l*Kw_+bDpF_x%VTL=9>nRYRHqc% zgJh7RLl(`IHZrK`Zf3NXF!)y?sx8Y zf4`Uhu&?zHEXG^1Xvy+bK!@-qkfidY_5kxVTtrFV_!x02-TmC1a_E6O{$zo;2SlqHzx6w>DREOyZj#+w z&JL-cPll*(feU*pURvf<^kvE~W5xo`sD!XjQd_7lnqpzg@u0?3lrPi|=j^S*1LS3_8=JH>;JYcDE82?PToF$7E(3~anhrr@vXBo*F#W}|xs`06L 
ztu`ip1p)#Afq+0jARrJB2nYlO0s;YnfIvVXAP^7;2m}NI0s(=5KtLcM5D*9m1Ox&C z0fB%(Kp-Fx5C{ka1Ofs9fq+0jARrJB2nYl|w+L+1od4I~<@|ppKd_1?N7eKH1#r}M zIG(Al#WS-XY7e~f5m#zPPA5;1(wSS0i6T;aMCn|w4r;N#6Z}B0mfCp-MCsfv`ry^V z{j_u*)L|itwpdCpM7_vg=L#)-NU>s!C^u8in&TN^><-ToC(kEydm1~&M?N??g&J|%C z2$VH&q5ikLecnC8WBI5*q0uw9yWSfKxx-O+Bv2LhMeD*r(#h2!d9Az37mWsLe5B-J zsM70cAYIZh?2C{N?Bv=yk}9j~B7S#3_Q-x8s}2O~)=$T7ZSdEjzQ^I7X>&XRK|;~p z@kiB3Fmw3l;HJ>NfVKV;vU66AR`5>Tv#DOO~^FtxxPpoBHJS z4g0^|mcC+Y_Q<~+IEnc{xl)3$fy{JjX3=?a>Wl2?y_{>e6CpI;#^;yPU>~0k%FlOs zJbNA|>SMEK`Hq?m zhBXIT)3SOl_1)H4r1;OY05|}r@=1!M0L*)WB=7I)hnx7PPYu1;Zl2C z-M!sq=}jce#;OL{aE+;aM21fAK4T9WS9Z*R72ioep@cA@?JrbCh|&_#1` z>kwg7*i9Ly&d9Nqi;uJdvax;HE6x3xETCZ23UdQOEY~Euoy_^ZlgU)WptO7iofBu7lhcajToa+3@ zPW@xekU7neNsv8xn2T0j>t*nG?Q&tym-_x;wm{xA_ZCX=&i+%d_Ih<{?&twK%$_&e z{swEXib3{V?bH2h_Lwt`I$XI6+HSzCp6sURUrx>?+&#{B7}Pbs_KmGudT@=7f3wSy5BD77woPxrs$;3EJ=W1Q472JiAZ698d&c6XMg_gi zjmXP7OtN)Gmu;BaM$G4+T9-HWaOR+$Q7X)hss65 zs;bf#lzm{0)&4**TI;Ec_!P0Grmh-{nMq0?;wmnlW6dsGRPMCZJ7(B&?YT3i+a1## zk34L3%(U5Ub}OnJxsFGz*}nC)zA$W9gHca4ygg})_#@G9)KkH1Rlz!&-xKjOo3|kt zX{b?YG_2Ot`@)exD0qk9_A*;A6!j^Vo|=HnY(BrcGVH1Gx&2;nB#GIgzV%Ts^F%$& zrr=@oM5EzAMO_s3#5OrpgAoBh_gUb+t32P){*LycMDuw2wB4h96zyAazu5nYTP%FrtP1cu2`+f2 zIT=6gYl)8JmNb6qhiER;;~tr0+BXv={Iv1&W?3OHu*FQqf4ApdvM1b0b^(LW1QZ{w zBcgUbVDcmUXdhe(zGy=6-|e{>Oq4?Hko+_-ss8lbO4Lh+oKpX!_H%!Fu1QWXH+)~y zp+;mu?oZ=C$Dc=uYWpLZC@vxJt)+N~(*I{vl7dqXXHwe)9O~>zjmj+HFA~qu=*d5o zQtYXwE!_X#itUp>JT( zx2rz8|F??@xcx8qcYW@(d%dW%Mg3M)=2z7>DCxYdtobg_S}r$m`aGxYobKWDRZb6b zdYIE5PO1M1#E9=NDM)j4I;$^cyO5Ao!t)1?kuyj zO=(EP0`Vul5ECVe7>&lC#0L|7@I@t>$b%0?UwAe#G0_Bwydl@qFEIw9Nbo06?<$v#am6?79z(h@*a-n_qtFd9KqY{(s}cwL$cV2m zA`uNX=^8ZvdG1e84CWS!MK#9ZF+NT~2R#@FN?0n-{OaV}JoZ1eJe(VA4O-wlVd~(s zu&w*SL+d?!;GVqw!o9^e?xR^x+aDW$gD82Oa#7cc zuU(v4!zORudgm^KUruE5LG(&f`Yr>o2kQ@5zsiX5%L|8ron>ror{u0lA0iJeEiG*$ zlT24~;sxoYK&B%M&t1!0?~sz2E^u*Us6K)?#g9ADjGIOMal{RIxLDoaaTgaOe^dMR z?T!r{o}QUh=2F8-I+-4hCsXm%_N}qhh>}#2F%qTHsa>%leZHdGMoBL_nziMRsk7O!rEC7%K>JmnY&9xf>!I!qdID;yrA0S8qffSUb?eR)?Yw z<64I&_DEs_Qn$C=*h32j?UGoK-e5##?H$0dT?H&ilzOt; zBV81w!3Hg&8nj`b7H!cIgJ&(P1O?O59J&-@`%*g|kjb%lNy{30&C2KqvPM;lS1WpM zZ#tRUg?T?rYCAW(*G=w{mW#!UpjI8tc7Q_ZHk34@tQIP^l7qx7N)j4M7tAx-DQhl4 zM=L>RNJ*uY)UfKFn9-5AX_Ra83DcY_#VfXTQqMWq$!8OeUUd@Vho%#SS~=$!R#`I* z$Iz>ZidA(=mhI@Jgl-n%Im_1FRsHuy#vO_A^ln7TnBHP5n&mQFZ;21Ujh6CYqj{Z| zq4kX}L#U2vWMs^yRz(K>xcQVa;AXcdvU@u$SIE$@Oi1|}fZI!c1tFz|^$};+ySJCG z8(oBi_g^DBP#z`!ef&Y>fu>zKzJAeW`AL+oMnC@h>(~l-+W#oMf<8Z|SCF4uLH^uhlTffqbosH$2tt*mCM70W0)x;==JnThcu<7#GdYWkq6Vm~u8qfQ>2aQSSt z%FmOkHpBG1=4hbmWxUvx>Y0+GWsy3zM`s0Bwj5n4mTOA3W|;Z7kq5=m=N(XN%XOgW zvueS{LkUXGD$)CeDtfpS&2elaTf;L4N?y;_imGO7<>LLWQ7%|@RW@tubE1dm7yWrg zI1s^-7-wVSlK$_q7Cq2u5m%@<$KroA=!<}v&{XI-qYM)#QJnV)o_32(==*VmnEr~gh zA71JIEASdJ!l#fC{fRG#e_Vbn1@_CxUZM{xmYW`_w|?wd)c9otzRm4Fu3LJHANvZ5 z{W3zn$v5hXXI5|h*w;}*GXFeB`IYZaa$zs%yDQj##O)XAkv&G(HBym#jnE;U&_}rv se>Bj8RpbwTo7_S#2VW{K@yIz2^yi<5E5PKlm3;?)l0FkaPW|@(0Q3BfZ~y=R literal 0 HcmV?d00001 diff --git a/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.out b/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.out deleted file mode 100755 index ae28aa9ba9aad9700b193281fa8045587c344187..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3328 zcmbtW-ESL35TCuv*-o4|P6KM93dK#+4-mQ7Zs-?nN!_GrB@{seQp-oLXZswx_G)g^p$EPvj88ld zjrV?9y7=O)UEfc9_r=Kz-&9|I^~+N;XZshSXUqN%{<;=TMQ%kG|9*Yn zpD~;L-k5p$(#+Y&V*i5A+p^~~~UZh7~9mLKnmB|gHL&BX=xiu$R*ObIVIN~ zgO|Q-SJJrz0Oe;tK4*~gxZsSwrp&bv>(tk|Ag_q_8uE7dywo!91 
zTKS4uOJ#QISv@^C>{{n$oJQtR0l=Z-7{RSH3g>_wMYScVx#xBbSF-htVQ1@U!z8Vg zt!IrQbp^s^D}{#ZDXn8$*r;2D1KD(DZzh}BNp`nT zKEThmZI+4!{}5yEpWxKs6yzPp=AGq&(eO;ygF+d1*fuLQy#PaH$F_!wyq<=vYH6_G zxE5yM-hJ~m$zn+O?RNS8W1BDB294Gv$ajdeO|ArAU-l7 zqYAhQ>ogVOYfw5dE%5tN*g!KXfjr3SY3IbX`I!*>a3RwTmZbPSG}B6>J2FXMiR5bh zD&iY(-tZ0LHQ`@sg1^xOf42#Kvk5NG7hNU;=O6U;rQ6Ohrm9ZftQvKvQu8c#0FlYj zL(d&Da%1DiCJY0|xyeamY+{t_`9?#Wcf(jsvx=r?f??G#nR;W!_RKs=&-G`n1KlbcCD*hq&!ENC`$5|^~xN;8)|MUulYx2KsYY3OsA>}yQ=^rvg zTmGMe_OUOP`!M%1 zTEZBS=fp<8zwGHNVeXp<6Bd5YZQ`narHQ|c^L7*edEw80j{-5IUr@@k6QIGT!hb{< oJJ6WKN1h-25#fz)e32@F_+mkPIzC#3uFX|f6aVXCqW{YOKOsPGG5`Po diff --git a/lldb/test/API/functionalities/postmortem/elf-core/main_fpr.c b/lldb/test/API/functionalities/postmortem/elf-core/main_fpr.c new file mode 100644 index 0000000000000..bcfe6d27359ca --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/elf-core/main_fpr.c @@ -0,0 +1,14 @@ +static void bar(float *boom) { + float F = 98.0; + *boom = 47.0; // Frame bar +} + +static void foo(float *boom, void (*boomer)(float *)) { + float F = 102.0; + boomer(boom); // Frame foo +} + +void _start(void) { + float F = 95.0; + foo(0, bar); // Frame _start +} From 2d7339ad24b41eb06c417f7067b9fbeb4fdb2e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Wed, 4 Sep 2024 16:41:43 +0200 Subject: [PATCH 106/425] [AMDGPU][LDS] Fix dynamic LDS interaction with "amdgpu-no-lds-kernel-id" (#107092) Dynamic lds and Table lds both use the amdgpu_lds_kernel_id intrinsic. Kernels and functons that make an indirect use of this should not have the "amdgpu-no-lds-kernel-id" attribute. For the later, this was done. For the dynamic lds case, this was missing. This patch fixes it. --- .../Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 15 ++++++++------- .../AMDGPU/lower-module-lds-zero-size-arr.ll | 9 ++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 38f0b6dda1997..a166087a5cdc7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -1010,13 +1010,6 @@ class AMDGPULowerModuleLDS { M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement); replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered, LookupTable); - - // Strip amdgpu-no-lds-kernel-id from all functions reachable from the - // kernel. We may have inferred this wasn't used prior to the pass. - // - // TODO: We could filter out subgraphs that do not access LDS globals. - for (Function *F : KernelsThatAllocateTableLDS) - removeFnAttrFromReachable(CG, F, {"amdgpu-no-lds-kernel-id"}); } DenseMap KernelToCreatedDynamicLDS = @@ -1024,6 +1017,14 @@ class AMDGPULowerModuleLDS { KernelsThatIndirectlyAllocateDynamicLDS, DynamicVariables, OrderedKernels); + // Strip amdgpu-no-lds-kernel-id from all functions reachable from the + // kernel. We may have inferred this wasn't used prior to the pass. + // TODO: We could filter out subgraphs that do not access LDS globals. + for (auto *KernelSet : {&KernelsThatIndirectlyAllocateDynamicLDS, + &KernelsThatAllocateTableLDS}) + for (Function *F : *KernelSet) + removeFnAttrFromReachable(CG, F, {"amdgpu-no-lds-kernel-id"}); + // All kernel frames have been allocated. Calculate and record the // addresses. 
{ diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll index c7829be565373..59dfe3300c293 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll @@ -12,7 +12,7 @@ ;. define void @fn(float %val, i32 %idx) #0 { ; CHECK-LABEL: define void @fn( -; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[VAR0]], align 4 @@ -28,7 +28,7 @@ define void @fn(float %val, i32 %idx) #0 { define amdgpu_kernel void @kernelA(float %val, i32 %idx) #0 { ; CHECK-LABEL: define amdgpu_kernel void @kernelA( -; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { +; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernelA.dynlds) ] ; CHECK-NEXT: tail call void @fn(float [[VAL]], i32 [[IDX]]) ; CHECK-NEXT: ret void @@ -40,9 +40,8 @@ define amdgpu_kernel void @kernelA(float %val, i32 %idx) #0 { attributes #0 = { "amdgpu-no-lds-kernel-id" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-lds-kernel-id" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CHECK: [[META0]] = !{i32 0, i32 1} ; CHECK: [[META1]] = !{i32 0} From 865edb0436bc55a3df3596eefb9a83050a5c7a96 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 4 Sep 2024 15:07:40 +0000 Subject: [PATCH 107/425] [mlir][ArmSME] Fix typo (NFC) --- mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td index d847dda5ae9f9..9a058ae4fe764 100644 --- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td +++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td @@ -605,7 +605,7 @@ def InsertTileSliceOp : ArmSME_Op<"insert_tile_slice", [ ]> { let summary = "Insert 1-D scalable vector into slice of 2-D tile"; let description = [{ - Inserts a 1-D scalable vector to a slice of a 2-D scalable vector tile at + Inserts a 1-D scalable vector into a slice of a 2-D scalable vector tile at the given index. The type of the 1-D scalable vector to be inserted must match the type of the tile slice. A tile slice is a 1-D vector of horizontally or vertically contiguous elements within a ZA tile. The updated From ba40737e819b4ca77b25c0950c47c305a15a93de Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 11:17:05 -0400 Subject: [PATCH 108/425] [libc++][modules] Include __type_traits/invoke.h from __type_traits/result_of.h (#106796) The result_of trait requires the __invoke_of implementation detail, but that is defined under __type_traits, not under __functional. 
Include the right header directly to remove a dependency from __type_traits to __functional. --- libcxx/include/__type_traits/result_of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/include/__type_traits/result_of.h b/libcxx/include/__type_traits/result_of.h index f00fa8e9be7f7..73a1944752066 100644 --- a/libcxx/include/__type_traits/result_of.h +++ b/libcxx/include/__type_traits/result_of.h @@ -10,7 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_RESULT_OF_H #include <__config> -#include <__functional/invoke.h> +#include <__type_traits/invoke.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header From d9019d478d40b4e8766efccdb3eb1ff77cdbfaec Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 11:17:32 -0400 Subject: [PATCH 109/425] [libc++] Remove unused pair.h include from hypot.h (#106798) This was added in #100820 by mistake since the final version of that PR didn't depend on std::pair anymore. --- libcxx/include/__math/hypot.h | 1 - libcxx/test/libcxx/transitive_includes/cxx03.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx11.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx14.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx17.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx20.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx23.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx26.csv | 1 - 8 files changed, 8 deletions(-) diff --git a/libcxx/include/__math/hypot.h b/libcxx/include/__math/hypot.h index b992163711010..2c2c9c38ab530 100644 --- a/libcxx/include/__math/hypot.h +++ b/libcxx/include/__math/hypot.h @@ -18,7 +18,6 @@ #include <__type_traits/is_arithmetic.h> #include <__type_traits/is_same.h> #include <__type_traits/promote.h> -#include <__utility/pair.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index fd2cd7f1c2d96..fd47c65ffc307 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -132,7 +132,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath type_traits diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index 04122fd0f4571..347d5d8796687 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -132,7 +132,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath type_traits diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index 42c742bead0c1..57bb2f25b3d62 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -133,7 +133,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath type_traits diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index bc0659127d4bf..6826aeb75a83b 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -133,7 +133,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath type_traits diff --git 
a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index fed0944f0219c..17169e385d544 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -138,7 +138,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath type_traits diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 53f99384a7f57..267dca3cc6c41 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -85,7 +85,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 53f99384a7f57..267dca3cc6c41 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -85,7 +85,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath version From 7a785d46d6c31937c620f186464fdc59c265b4bf Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 11:18:00 -0400 Subject: [PATCH 110/425] [libc++][modules] Use inline variable instead of true_type (#106797) This allows breaking up a dependency from __fwd/array.h onto __type_traits, which is a circular dependency once __type_traits becomes a module of its own. This is also a small consistency improvement since we've been using inline variables for traits like this elsewhere in the library. --- libcxx/include/__fwd/array.h | 6 +++--- libcxx/include/span | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcxx/include/__fwd/array.h b/libcxx/include/__fwd/array.h index b429d0c5a9542..6c6461e727604 100644 --- a/libcxx/include/__fwd/array.h +++ b/libcxx/include/__fwd/array.h @@ -35,11 +35,11 @@ template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp&& get(const array<_Tp, _Size>&&) _NOEXCEPT; #endif -template -struct __is_std_array : false_type {}; +template +inline const bool __is_std_array_v = false; template -struct __is_std_array > : true_type {}; +inline const bool __is_std_array_v > = true; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/span b/libcxx/include/span index da631cdc3f90e..a32f7a372e2ae 100644 --- a/libcxx/include/span +++ b/libcxx/include/span @@ -210,7 +210,7 @@ concept __span_compatible_range = ranges::contiguous_range<_Range> && // ranges::sized_range<_Range> && // (ranges::borrowed_range<_Range> || is_const_v<_ElementType>) && // - !__is_std_array>::value && // + !__is_std_array_v> && // !is_array_v> && // is_convertible_v> (*)[], _ElementType (*)[]>; From c1a8283fcc735b1567c49bb6cd485f9e71a12cc4 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 11:18:30 -0400 Subject: [PATCH 111/425] [libc++][modules] Move __noexcept_move_assign_container out of __type_traits (#107140) That header depends on allocator traits, which is fundamentally tied to ``, not to ``. This breaks a cycle betweeen __type_traits and __memory. 
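For reference, the trait is essentially an allocator_traits query, which is why it belongs next to <memory> rather than <type_traits>. A minimal sketch of the idea (illustrative only; the name and exact form below are not the libc++ definition):

    #include <memory>
    #include <type_traits>

    // A container's move assignment can be noexcept when its allocator either
    // propagates on container move assignment or always compares equal; both
    // facts are queried through std::allocator_traits, i.e. <memory> machinery.
    template <class Alloc>
    struct noexcept_move_assign_container_sketch
        : std::integral_constant<
              bool,
              std::allocator_traits<Alloc>::propagate_on_container_move_assignment::value ||
                  std::allocator_traits<Alloc>::is_always_equal::value> {};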
--- libcxx/include/CMakeLists.txt | 2 +- .../noexcept_move_assign_container.h | 6 +++--- libcxx/include/module.modulemap | 2 +- libcxx/include/string | 2 +- libcxx/include/vector | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) rename libcxx/include/{__type_traits => __memory}/noexcept_move_assign_container.h (85%) diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 32579272858a8..210beaf5a3364 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -536,6 +536,7 @@ set(files __memory/construct_at.h __memory/destruct_n.h __memory/inout_ptr.h + __memory/noexcept_move_assign_container.h __memory/out_ptr.h __memory/pointer_traits.h __memory/ranges_construct_at.h @@ -824,7 +825,6 @@ set(files __type_traits/maybe_const.h __type_traits/nat.h __type_traits/negation.h - __type_traits/noexcept_move_assign_container.h __type_traits/promote.h __type_traits/rank.h __type_traits/remove_all_extents.h diff --git a/libcxx/include/__type_traits/noexcept_move_assign_container.h b/libcxx/include/__memory/noexcept_move_assign_container.h similarity index 85% rename from libcxx/include/__type_traits/noexcept_move_assign_container.h rename to libcxx/include/__memory/noexcept_move_assign_container.h index baaf36d9980e9..b0063516aaafc 100644 --- a/libcxx/include/__type_traits/noexcept_move_assign_container.h +++ b/libcxx/include/__memory/noexcept_move_assign_container.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___TYPE_TRAITS_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H -#define _LIBCPP___TYPE_TRAITS_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H +#ifndef _LIBCPP___MEMORY_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H +#define _LIBCPP___MEMORY_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H #include <__config> #include <__memory/allocator_traits.h> @@ -34,4 +34,4 @@ struct __noexcept_move_assign_container _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP___TYPE_TRAITS_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H +#endif // _LIBCPP___MEMORY_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index f193b5d95f49f..297d155cb5594 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1531,6 +1531,7 @@ module std_private_memory_construct_at [system] { header "__m module std_private_memory_destruct_n [system] { header "__memory/destruct_n.h" } module std_private_memory_fwd [system] { header "__fwd/memory.h" } module std_private_memory_inout_ptr [system] { header "__memory/inout_ptr.h" } +module std_private_memory_noexcept_move_assign_container [system] { header "__memory/noexcept_move_assign_container.h" } module std_private_memory_out_ptr [system] { header "__memory/out_ptr.h" } module std_private_memory_pointer_traits [system] { header "__memory/pointer_traits.h" } module std_private_memory_ranges_construct_at [system] { header "__memory/ranges_construct_at.h" } @@ -2023,7 +2024,6 @@ module std_private_type_traits_make_unsigned [system module std_private_type_traits_maybe_const [system] { header "__type_traits/maybe_const.h" } module std_private_type_traits_nat [system] { header "__type_traits/nat.h" } module std_private_type_traits_negation [system] { header "__type_traits/negation.h" } -module std_private_type_traits_noexcept_move_assign_container [system] { header "__type_traits/noexcept_move_assign_container.h" } module std_private_type_traits_promote [system] { header "__type_traits/promote.h" } module std_private_type_traits_rank [system] { header 
"__type_traits/rank.h" } module std_private_type_traits_remove_all_extents [system] { header "__type_traits/remove_all_extents.h" } diff --git a/libcxx/include/string b/libcxx/include/string index 15c7a2f6b988b..5cb0693ad10bc 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -609,6 +609,7 @@ basic_string operator""s( const char32_t *str, size_t len ); #include <__memory/allocator_traits.h> #include <__memory/compressed_pair.h> #include <__memory/construct_at.h> +#include <__memory/noexcept_move_assign_container.h> #include <__memory/pointer_traits.h> #include <__memory/swap_allocator.h> #include <__memory_resource/polymorphic_allocator.h> @@ -629,7 +630,6 @@ basic_string operator""s( const char32_t *str, size_t len ); #include <__type_traits/is_standard_layout.h> #include <__type_traits/is_trivial.h> #include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/noexcept_move_assign_container.h> #include <__type_traits/remove_cvref.h> #include <__type_traits/void_t.h> #include <__utility/auto_cast.h> diff --git a/libcxx/include/vector b/libcxx/include/vector index 0f852e7f36c29..2442852c764a6 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -334,6 +334,7 @@ template requires is-vector-bool-reference // Since C++ #include <__memory/addressof.h> #include <__memory/allocate_at_least.h> #include <__memory/allocator_traits.h> +#include <__memory/noexcept_move_assign_container.h> #include <__memory/pointer_traits.h> #include <__memory/swap_allocator.h> #include <__memory/temp_value.h> @@ -348,7 +349,6 @@ template requires is-vector-bool-reference // Since C++ #include <__type_traits/is_allocator.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/noexcept_move_assign_container.h> #include <__type_traits/type_identity.h> #include <__utility/exception_guard.h> #include <__utility/forward.h> From 3d9abfc9f841b13825e3d03cfba272f5eeab9a3b Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 17:13:51 -0700 Subject: [PATCH 112/425] Consolidate all IR logic for getting the identity value of a reduction [nfc] This change merges the three different places (at the IR layer) for finding the identity value of a reduction into a single copy. This depends on several prior commits which fix ommissions and bugs in the distinct copies, but this patch itself should be fully non-functional. As the new comments and naming try to make clear, the identity value is a property of the @llvm.vector.reduce.* intrinsic, not of e.g. the recurrence descriptor. (We still provide an interface for clients using recurrence descriptors, but the implementation simply translates to the intrinsic which each corresponds to.) As a note, the getIntrinsicIdentity API does not support fminnum/fmaxnum or fminimum/fmaximum which is why we still need manual logic (but at least only one copy of manual logic) for those cases. 
--- llvm/include/llvm/Analysis/IVDescriptors.h | 3 - .../include/llvm/Transforms/Utils/LoopUtils.h | 8 +++ llvm/lib/Analysis/IVDescriptors.cpp | 46 --------------- llvm/lib/CodeGen/ExpandVectorPredication.cpp | 51 ++-------------- llvm/lib/Transforms/Utils/LoopUtils.cpp | 59 +++++++++++++++++-- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +-- llvm/unittests/Analysis/IVDescriptorsTest.cpp | 8 --- 7 files changed, 70 insertions(+), 113 deletions(-) diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index e7e6c5c01ad4d..5d992faf99d27 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -155,9 +155,6 @@ class RecurrenceDescriptor { /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. static InstDesc isConditionalRdxPattern(RecurKind Kind, Instruction *I); - /// Returns identity corresponding to the RecurrenceKind. - static Value *getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF); - /// Returns the opcode corresponding to the RecurrenceKind. static unsigned getOpcode(RecurKind Kind); diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index ba8af4aa2b0cd..a761859465210 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -378,6 +378,14 @@ RecurKind getMinMaxReductionRecurKind(Intrinsic::ID RdxID); /// Returns the comparison predicate used when expanding a min/max reduction. CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK); +/// Given information about an @llvm.vector.reduce.* intrinsic, return +/// the identity value for the reduction. +Value *getReductionIdentity(Intrinsic::ID RdxID, Type *Ty, FastMathFlags FMF); + +/// Given information about an recurrence kind, return the identity +/// for the @llvm.vector.reduce.* used to generate it. +Value *getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF); + /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind. /// The Builder's fast-math-flags must be set to propagate the expected values. Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index fdf78b9f8a44e..53001421ce6f7 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -1031,52 +1031,6 @@ bool RecurrenceDescriptor::isFixedOrderRecurrence(PHINode *Phi, Loop *TheLoop, return true; } -/// This function returns the identity element (or neutral element) for -/// the operation K. 
-Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp, - FastMathFlags FMF) { - switch (K) { - case RecurKind::Xor: - case RecurKind::Add: - case RecurKind::Or: - case RecurKind::Mul: - case RecurKind::And: - case RecurKind::FMul: - case RecurKind::FAdd: - return ConstantExpr::getBinOpIdentity(getOpcode(K), Tp, false, FMF.noSignedZeros()); - case RecurKind::FMulAdd: - return ConstantExpr::getBinOpIdentity(Instruction::FAdd, Tp, false, FMF.noSignedZeros()); - case RecurKind::UMin: - return ConstantInt::get(Tp, -1, true); - case RecurKind::UMax: - return ConstantInt::get(Tp, 0); - case RecurKind::SMin: - return ConstantInt::get(Tp, - APInt::getSignedMaxValue(Tp->getIntegerBitWidth())); - case RecurKind::SMax: - return ConstantInt::get(Tp, - APInt::getSignedMinValue(Tp->getIntegerBitWidth())); - case RecurKind::FMin: - case RecurKind::FMax: - assert((FMF.noNaNs() && FMF.noSignedZeros()) && - "nnan, nsz is expected to be set for FP min/max reduction."); - [[fallthrough]]; - case RecurKind::FMinimum: - case RecurKind::FMaximum: { - bool Negative = K == RecurKind::FMax || K == RecurKind::FMaximum; - const fltSemantics &Semantics = Tp->getFltSemantics(); - return !FMF.noInfs() - ? ConstantFP::getInfinity(Tp, Negative) - : ConstantFP::get(Tp, APFloat::getLargest(Semantics, Negative)); - } - case RecurKind::IAnyOf: - case RecurKind::FAnyOf: - llvm_unreachable("No meaningful identity for recurrence kind"); - default: - llvm_unreachable("Unknown recurrence kind"); - } -} - unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { switch (Kind) { case RecurKind::Add: diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index fe95a27f0a24f..ffe879ff04964 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -368,52 +368,11 @@ Value *CachingVPExpander::expandPredicationToFPCall( static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, Type *EltTy) { - bool Negative = false; - unsigned EltBits = EltTy->getScalarSizeInBits(); - Intrinsic::ID VID = VPI.getIntrinsicID(); - switch (VID) { - default: - llvm_unreachable("Expecting a VP reduction intrinsic"); - case Intrinsic::vp_reduce_add: - case Intrinsic::vp_reduce_or: - case Intrinsic::vp_reduce_xor: - case Intrinsic::vp_reduce_umax: - return Constant::getNullValue(EltTy); - case Intrinsic::vp_reduce_mul: - return ConstantInt::get(EltTy, 1, /*IsSigned*/ false); - case Intrinsic::vp_reduce_and: - case Intrinsic::vp_reduce_umin: - return ConstantInt::getAllOnesValue(EltTy); - case Intrinsic::vp_reduce_smin: - return ConstantInt::get(EltTy->getContext(), - APInt::getSignedMaxValue(EltBits)); - case Intrinsic::vp_reduce_smax: - return ConstantInt::get(EltTy->getContext(), - APInt::getSignedMinValue(EltBits)); - case Intrinsic::vp_reduce_fmax: - case Intrinsic::vp_reduce_fmaximum: - Negative = true; - [[fallthrough]]; - case Intrinsic::vp_reduce_fmin: - case Intrinsic::vp_reduce_fminimum: { - bool PropagatesNaN = VID == Intrinsic::vp_reduce_fminimum || - VID == Intrinsic::vp_reduce_fmaximum; - FastMathFlags Flags = VPI.getFastMathFlags(); - const fltSemantics &Semantics = EltTy->getFltSemantics(); - return (!Flags.noNaNs() && !PropagatesNaN) - ? ConstantFP::getQNaN(EltTy, Negative) - : !Flags.noInfs() - ? 
ConstantFP::getInfinity(EltTy, Negative) - : ConstantFP::get(EltTy, - APFloat::getLargest(Semantics, Negative)); - } - case Intrinsic::vp_reduce_fadd: - return ConstantExpr::getBinOpIdentity( - Instruction::FAdd, EltTy, false, - VPI.getFastMathFlags().noSignedZeros()); - case Intrinsic::vp_reduce_fmul: - return ConstantFP::get(EltTy, 1.0); - } + Intrinsic::ID RdxID = *VPI.getFunctionalIntrinsicID(); + FastMathFlags FMF; + if (isa(VPI)) + FMF = VPI.getFastMathFlags(); + return getReductionIdentity(RdxID, EltTy, FMF); } Value * diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 559129442a041..9a4289e1a30da 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1207,14 +1207,62 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select"); } +Value *llvm::getReductionIdentity(Intrinsic::ID RdxID, Type *Ty, + FastMathFlags Flags) { + bool Negative = false; + switch (RdxID) { + default: + llvm_unreachable("Expecting a reduction intrinsic"); + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: { + unsigned Opc = getArithmeticReductionInstruction(RdxID); + return ConstantExpr::getBinOpIdentity(Opc, Ty, false, + Flags.noSignedZeros()); + } + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: { + Intrinsic::ID ScalarID = getMinMaxReductionIntrinsicOp(RdxID); + return ConstantExpr::getIntrinsicIdentity(ScalarID, Ty); + } + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmaximum: + Negative = true; + [[fallthrough]]; + case Intrinsic::vector_reduce_fmin: + case Intrinsic::vector_reduce_fminimum: { + bool PropagatesNaN = RdxID == Intrinsic::vector_reduce_fminimum || + RdxID == Intrinsic::vector_reduce_fmaximum; + const fltSemantics &Semantics = Ty->getFltSemantics(); + return (!Flags.noNaNs() && !PropagatesNaN) + ? ConstantFP::getQNaN(Ty, Negative) + : !Flags.noInfs() + ? 
ConstantFP::getInfinity(Ty, Negative) + : ConstantFP::get(Ty, APFloat::getLargest(Semantics, Negative)); + } + } +} + +Value *llvm::getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF) { + assert((!(K == RecurKind::FMin || K == RecurKind::FMax) || + (FMF.noNaNs() && FMF.noSignedZeros())) && + "nnan, nsz is expected to be set for FP min/max reduction."); + Intrinsic::ID RdxID = getReductionIntrinsicID(K); + return getReductionIdentity(RdxID, Tp, FMF); +} + Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, RecurKind RdxKind) { auto *SrcVecEltTy = cast(Src->getType())->getElementType(); auto getIdentity = [&]() { - Intrinsic::ID ID = getReductionIntrinsicID(RdxKind); - unsigned Opc = getArithmeticReductionInstruction(ID); - bool NSZ = Builder.getFastMathFlags().noSignedZeros(); - return ConstantExpr::getBinOpIdentity(Opc, SrcVecEltTy, false, NSZ); + return getRecurrenceIdentity(RdxKind, SrcVecEltTy, + Builder.getFastMathFlags()); }; switch (RdxKind) { case RecurKind::Add: @@ -1249,8 +1297,7 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src, Intrinsic::ID Id = getReductionIntrinsicID(Kind); auto *SrcTy = cast(Src->getType()); Type *SrcEltTy = SrcTy->getElementType(); - Value *Iden = - Desc.getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags()); + Value *Iden = getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags()); Value *Ops[] = {Iden, Src}; return VBuilder.createSimpleReduction(Id, SrcTy, Ops); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 49ed733107da9..7708763aef859 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1840,8 +1840,8 @@ void VPReductionRecipe::execute(VPTransformState &State) { if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) Start = RdxDesc.getRecurrenceStartValue(); else - Start = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, - RdxDesc.getFastMathFlags()); + Start = llvm::getRecurrenceIdentity(Kind, ElementTy, + RdxDesc.getFastMathFlags()); if (State.VF.isVector()) Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); @@ -3010,8 +3010,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); } } else { - Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), - RdxDesc.getFastMathFlags()); + Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(), + RdxDesc.getFastMathFlags()); if (!ScalarPHI) { Iden = Builder.CreateVectorSplat(State.VF, Iden); diff --git a/llvm/unittests/Analysis/IVDescriptorsTest.cpp b/llvm/unittests/Analysis/IVDescriptorsTest.cpp index 32825d55ebd9a..ce9383d15f461 100644 --- a/llvm/unittests/Analysis/IVDescriptorsTest.cpp +++ b/llvm/unittests/Analysis/IVDescriptorsTest.cpp @@ -209,10 +209,6 @@ for.end: EXPECT_TRUE(IsRdxPhi); RecurKind Kind = Rdx.getRecurrenceKind(); EXPECT_EQ(Kind, RecurKind::FMin); - Type *Ty = Phi->getType(); - Value *Id = Rdx.getRecurrenceIdentity(Kind, Ty, Rdx.getFastMathFlags()); - // Identity value for FP min reduction is +Inf. - EXPECT_EQ(Id, ConstantFP::getInfinity(Ty, false /*Negative*/)); }); } @@ -261,9 +257,5 @@ for.end: EXPECT_TRUE(IsRdxPhi); RecurKind Kind = Rdx.getRecurrenceKind(); EXPECT_EQ(Kind, RecurKind::FMax); - Type *Ty = Phi->getType(); - Value *Id = Rdx.getRecurrenceIdentity(Kind, Ty, Rdx.getFastMathFlags()); - // Identity value for FP max reduction is -Inf. 
- EXPECT_EQ(Id, ConstantFP::getInfinity(Ty, true /*Negative*/)); }); } From 3ebd79751f2d5e1c54047409865c051daba0a21b Mon Sep 17 00:00:00 2001 From: Jan Leyonberg Date: Wed, 4 Sep 2024 11:29:10 -0400 Subject: [PATCH 113/425] [MLIR][ROCDL] Remove patterns for ops supported as intrinsics in the AMDGPU backend (#102971) This patch removes patterns for a few operations which allows mathToLLVM conversion to convert the operations into LLVM intrinsics instead since they are supported directly by the AMDGPU backend. --- .../Conversion/MathToROCDL/MathToROCDL.cpp | 14 ++- .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 86 +++++-------------- .../Conversion/MathToROCDL/math-to-rocdl.mlir | 44 +--------- 3 files changed, 31 insertions(+), 113 deletions(-) diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp index 7de6971ba2ee7..b3b4d81e7ffa5 100644 --- a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp +++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp @@ -48,18 +48,20 @@ static void populateOpPatterns(LLVMTypeConverter &converter, void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns) { // Handled by mathToLLVM: math::AbsIOp + // Handled by mathToLLVM: math::AbsFIOp // Handled by mathToLLVM: math::CopySignOp // Handled by mathToLLVM: math::CountLeadingZerosOp // Handled by mathToLLVM: math::CountTrailingZerosOp // Handled by mathToLLVM: math::CgPopOp + // Handled by mathToLLVM: math::ExpOp (32-bit only) // Handled by mathToLLVM: math::FmaOp + // Handled by mathToLLVM: math::LogOp (32-bit only) // FIXME: math::IPowIOp // FIXME: math::FPowIOp // Handled by mathToLLVM: math::RoundEvenOp // Handled by mathToLLVM: math::RoundOp + // Handled by mathToLLVM: math::SqrtOp // Handled by mathToLLVM: math::TruncOp - populateOpPatterns(converter, patterns, "__ocml_fabs_f32", - "__ocml_fabs_f64"); populateOpPatterns(converter, patterns, "__ocml_acos_f32", "__ocml_acos_f64"); populateOpPatterns(converter, patterns, "__ocml_acosh_f32", @@ -84,16 +86,14 @@ void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter, "__ocml_cosh_f64"); populateOpPatterns(converter, patterns, "__ocml_sinh_f32", "__ocml_sinh_f64"); - populateOpPatterns(converter, patterns, "__ocml_exp_f32", - "__ocml_exp_f64"); + populateOpPatterns(converter, patterns, "", "__ocml_exp_f64"); populateOpPatterns(converter, patterns, "__ocml_exp2_f32", "__ocml_exp2_f64"); populateOpPatterns(converter, patterns, "__ocml_expm1_f32", "__ocml_expm1_f64"); populateOpPatterns(converter, patterns, "__ocml_floor_f32", "__ocml_floor_f64"); - populateOpPatterns(converter, patterns, "__ocml_log_f32", - "__ocml_log_f64"); + populateOpPatterns(converter, patterns, "", "__ocml_log_f64"); populateOpPatterns(converter, patterns, "__ocml_log10_f32", "__ocml_log10_f64"); populateOpPatterns(converter, patterns, "__ocml_log1p_f32", @@ -106,8 +106,6 @@ void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter, "__ocml_rsqrt_f64"); populateOpPatterns(converter, patterns, "__ocml_sin_f32", "__ocml_sin_f64"); - populateOpPatterns(converter, patterns, "__ocml_sqrt_f32", - "__ocml_sqrt_f64"); populateOpPatterns(converter, patterns, "__ocml_tanh_f32", "__ocml_tanh_f64"); populateOpPatterns(converter, patterns, "__ocml_tan_f32", diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index bf49a42a11577..b6fb08522ae1f 100644 --- 
a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -131,21 +131,6 @@ gpu.module @test_module { // ----- -gpu.module @test_module { - // CHECK: llvm.func @__ocml_fabs_f32(f32) -> f32 - // CHECK: llvm.func @__ocml_fabs_f64(f64) -> f64 - // CHECK-LABEL: func @gpu_fabs - func.func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %result32 = math.absf %arg_f32 : f32 - // CHECK: llvm.call @__ocml_fabs_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.absf %arg_f64 : f64 - // CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 - } -} - -// ----- - gpu.module @test_module { // CHECK: llvm.func @__ocml_cbrt_f32(f32) -> f32 // CHECK: llvm.func @__ocml_cbrt_f64(f64) -> f64 @@ -207,17 +192,12 @@ gpu.module @test_module { // ----- gpu.module @test_module { - // CHECK: llvm.func @__ocml_exp_f32(f32) -> f32 // CHECK: llvm.func @__ocml_exp_f64(f64) -> f64 // CHECK-LABEL: func @gpu_exp - func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %exp_f32 = math.exp %arg_f32 : f32 - // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 - %result32 = math.exp %exp_f32 : f32 - // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + func.func @gpu_exp(%arg_f64 : f64) -> (f64) { %result64 = math.exp %arg_f64 : f64 // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result64 : f64 } } @@ -239,21 +219,20 @@ gpu.module @test_module { } // ----- - // Test that we handled properly operation with SymbolTable other than module op gpu.module @test_module { "test.symbol_scope"() ({ // CHECK: test.symbol_scope - // CHECK: llvm.func @__ocml_exp_f32(f32) -> f32 - // CHECK: llvm.func @__ocml_exp_f64(f64) -> f64 - // CHECK-LABEL: func @gpu_exp - func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %exp_f32 = math.exp %arg_f32 : f32 - // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 - %result32 = math.exp %exp_f32 : f32 - // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.exp %arg_f64 : f64 - // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (f64) -> f64 + // CHECK: llvm.func @__ocml_sin_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_sin_f64(f64) -> f64 + // CHECK-LABEL: func @gpu_sin + func.func @gpu_sin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %sin_f32 = math.sin %arg_f32 : f32 + // CHECK: llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 + %result32 = math.sin %sin_f32 : f32 + // CHECK: llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.sin %arg_f64 : f64 + // CHECK: llvm.call @__ocml_sin_f64(%{{.*}}) : (f64) -> f64 func.return %result32, %result64 : f32, f64 } "test.finish" () : () -> () @@ -280,15 +259,12 @@ gpu.module @test_module { // ----- gpu.module @test_module { - // CHECK: llvm.func @__ocml_log_f32(f32) -> f32 // CHECK: llvm.func @__ocml_log_f64(f64) -> f64 // CHECK-LABEL: func @gpu_log - func.func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %result32 = math.log %arg_f32 : f32 - // CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (f32) -> f32 + func.func @gpu_log(%arg_f64 : f64) -> (f64) { %result64 = math.log %arg_f64 : f64 // CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result64 : f64 } } @@ -359,26 +335,6 @@ gpu.module @test_module { // ----- -gpu.module @test_module { - // CHECK: llvm.func @__ocml_sqrt_f32(f32) -> f32 - // CHECK: llvm.func 
@__ocml_sqrt_f64(f64) -> f64 - // CHECK-LABEL: func @gpu_sqrt - func.func @gpu_sqrt(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) - -> (f16, f32, f64) { - %result16 = math.sqrt %arg_f16 : f16 - // CHECK: llvm.fpext %{{.*}} : f16 to f32 - // CHECK-NEXT: llvm.call @__ocml_sqrt_f32(%{{.*}}) : (f32) -> f32 - // CHECK-NEXT: llvm.fptrunc %{{.*}} : f32 to f16 - %result32 = math.sqrt %arg_f32 : f32 - // CHECK: llvm.call @__ocml_sqrt_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.sqrt %arg_f64 : f64 - // CHECK: llvm.call @__ocml_sqrt_f64(%{{.*}}) : (f64) -> f64 - func.return %result16, %result32, %result64 : f16, f32, f64 - } -} - -// ----- - gpu.module @test_module { // CHECK: llvm.func @__ocml_tan_f32(f32) -> f32 // CHECK: llvm.func @__ocml_tan_f64(f64) -> f64 @@ -472,15 +428,15 @@ gpu.module @test_module { gpu.module @test_module { // CHECK-LABEL: func @gpu_unroll func.func @gpu_unroll(%arg0 : vector<4xf32>) -> vector<4xf32> { - %result = math.exp %arg0 : vector<4xf32> + %result = math.sin %arg0 : vector<4xf32> // CHECK: %[[V0:.+]] = llvm.mlir.undef : vector<4xf32> - // CHECK: %[[CL:.+]] = llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + // CHECK: %[[CL:.+]] = llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 // CHECK: %[[V1:.+]] = llvm.insertelement %[[CL]], %[[V0]] - // CHECK: %[[CL:.+]] = llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + // CHECK: %[[CL:.+]] = llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 // CHECK: %[[V2:.+]] = llvm.insertelement %[[CL]], %[[V1]] - // CHECK: %[[CL:.+]] = llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + // CHECK: %[[CL:.+]] = llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 // CHECK: %[[V3:.+]] = llvm.insertelement %[[CL]], %[[V2]] - // CHECK: %[[CL:.+]] = llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + // CHECK: %[[CL:.+]] = llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 // CHECK: %[[V4:.+]] = llvm.insertelement %[[CL]], %[[V3]] // CHECK: return %[[V4]] func.return %result : vector<4xf32> @@ -526,9 +482,9 @@ gpu.module @test_module { gpu.module @module { // CHECK-LABEL: @spirv_exp -// CHECK: llvm.call @__ocml_exp_f32 +// CHECK: llvm.call @__ocml_sin_f32 spirv.func @spirv_exp(%arg0: vector<4xf32>) -> vector<4xf32> "None" { - %0 = math.exp %arg0 : vector<4xf32> + %0 = math.sin %arg0 : vector<4xf32> spirv.ReturnValue %0 : vector<4xf32> } } diff --git a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir index a406ec45a7f10..19d89e03a7f48 100644 --- a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir +++ b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir @@ -15,21 +15,6 @@ module @test_module { // ----- -module @test_module { - // CHECK: llvm.func @__ocml_fabs_f32(f32) -> f32 - // CHECK: llvm.func @__ocml_fabs_f64(f64) -> f64 - // CHECK-LABEL: func @math_absf - func.func @math_absf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %result32 = math.absf %arg_f32 : f32 - // CHECK: llvm.call @__ocml_fabs_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.absf %arg_f64 : f64 - // CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 - } -} - -// ----- - module @test_module { // CHECK: llvm.func @__ocml_acos_f32(f32) -> f32 // CHECK: llvm.func @__ocml_acos_f64(f64) -> f64 @@ -211,15 +196,12 @@ module @test_module { // ----- module @test_module { - // CHECK: llvm.func @__ocml_exp_f32(f32) -> f32 // CHECK: llvm.func @__ocml_exp_f64(f64) -> f64 // CHECK-LABEL: func @math_exp - func.func @math_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - 
%result32 = math.exp %arg_f32 : f32 - // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + func.func @math_exp(%arg_f64 : f64) -> (f64) { %result64 = math.exp %arg_f64 : f64 // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result64 : f64 } } @@ -271,15 +253,12 @@ module @test_module { // ----- module @test_module { - // CHECK: llvm.func @__ocml_log_f32(f32) -> f32 // CHECK: llvm.func @__ocml_log_f64(f64) -> f64 // CHECK-LABEL: func @math_log - func.func @math_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %result32 = math.log %arg_f32 : f32 - // CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (f32) -> f32 + func.func @math_log(%arg_f64 : f64) -> (f64) { %result64 = math.log %arg_f64 : f64 // CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result64 : f64 } } @@ -360,21 +339,6 @@ module @test_module { // ----- -module @test_module { - // CHECK: llvm.func @__ocml_sqrt_f32(f32) -> f32 - // CHECK: llvm.func @__ocml_sqrt_f64(f64) -> f64 - // CHECK-LABEL: func @math_sqrt - func.func @math_sqrt(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %result32 = math.sqrt %arg_f32 : f32 - // CHECK: llvm.call @__ocml_sqrt_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.sqrt %arg_f64 : f64 - // CHECK: llvm.call @__ocml_sqrt_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 - } -} - -// ----- - module @test_module { // CHECK: llvm.func @__ocml_tanh_f32(f32) -> f32 // CHECK: llvm.func @__ocml_tanh_f64(f64) -> f64 From fe454b2044aba1d808cec486a8ca7a0e202d31bf Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 4 Sep 2024 15:36:21 +0000 Subject: [PATCH 114/425] [gn build] Port c1a8283fcc73 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index f49c964b4128f..39ee220ee3a72 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -608,6 +608,7 @@ if (current_toolchain == default_toolchain) { "__memory/construct_at.h", "__memory/destruct_n.h", "__memory/inout_ptr.h", + "__memory/noexcept_move_assign_container.h", "__memory/out_ptr.h", "__memory/pointer_traits.h", "__memory/ranges_construct_at.h", @@ -895,7 +896,6 @@ if (current_toolchain == default_toolchain) { "__type_traits/maybe_const.h", "__type_traits/nat.h", "__type_traits/negation.h", - "__type_traits/noexcept_move_assign_container.h", "__type_traits/promote.h", "__type_traits/rank.h", "__type_traits/remove_all_extents.h", From c81b43074ab010d01ad794224dd9dd22bbe8a1f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 4 Sep 2024 08:43:13 -0700 Subject: [PATCH 115/425] [flang][cuda] Fix lowering of cuf kernel with unstructured nested construct (#107149) Lowering was crashing when cuf kernels has an unstructured construct. Blocks created by PFT need to be re-created inside of the operation like it is done for OpenACC construct. 
--- flang/lib/Lower/Bridge.cpp | 8 +++++++- .../Lower/CUDA/cuda-kernel-loop-directive.cuf | 20 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index e5ccf659c3f8e..1f2724290b885 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "flang/Lower/Bridge.h" +#include "DirectivesCommon.h" #include "flang/Common/Version.h" #include "flang/Lower/Allocatable.h" #include "flang/Lower/CallInterface.h" @@ -2999,6 +3000,12 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Block &b = op.getRegion().back(); builder->setInsertionPointToStart(&b); + Fortran::lower::pft::Evaluation *crtEval = &getEval(); + if (crtEval->lowerAsUnstructured()) + Fortran::lower::createEmptyRegionBlocks( + *builder, crtEval->getNestedEvaluations()); + builder->setInsertionPointToStart(&b); + for (auto [arg, value] : llvm::zip( op.getLoopRegions().front()->front().getArguments(), ivValues)) { mlir::Value convArg = @@ -3006,7 +3013,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->create(loc, convArg, value); } - Fortran::lower::pft::Evaluation *crtEval = &getEval(); if (crtEval->lowerAsStructured()) { crtEval = &crtEval->getFirstNestedEvaluation(); for (int64_t i = 1; i < nestedLoops; i++) diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf index ba5d390df4785..aac569b6eb35b 100644 --- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf +++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf @@ -78,3 +78,23 @@ end ! CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]]#0 : !fir.ref ! CHECK: %[[STREAM_I32:.*]] = fir.convert %[[STREAM_LOAD]] : (i64) -> i32 ! CHECK: cuf.kernel<<<*, *, stream = %[[STREAM_I32]]>>> + + +! Test lowering with unstructured construct inside. +subroutine sub2(m,a,b) + integer :: m + real, device :: a(m,m), b(m) + integer :: i,j + !$cuf kernel do<<<*,*>>> + + do j = 1, m + i = 1 + do while (a(i,j).eq.0) + i = i + 1 + end do + b(j) = i + end do +end subroutine + +! CHECK-LABEL: func.func @_QPsub2 +! CHECK: cuf.kernel From 0367305af849da7ee9237fd83c04ed3a01e8d223 Mon Sep 17 00:00:00 2001 From: mzukovec <113346157+mzukovec@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:50:10 +0200 Subject: [PATCH 116/425] [lld][WebAssembly] Add allow-multiple-definition flag (#97699) Add `allow-multiple-definition` flag to `wasm-ld`. This follows the ELF linker logic. In case of duplication, the first symbol met is used. 
This PR resolves the #97543 --- .../wasm/Inputs/allow-multiple-definition.s | 6 +++ lld/test/wasm/allow-multiple-definition.s | 38 +++++++++++++++++++ lld/wasm/Config.h | 5 +++ lld/wasm/Driver.cpp | 24 +++++++++++- lld/wasm/Options.td | 7 ++++ lld/wasm/SymbolTable.cpp | 9 +++-- 6 files changed, 85 insertions(+), 4 deletions(-) create mode 100644 lld/test/wasm/Inputs/allow-multiple-definition.s create mode 100644 lld/test/wasm/allow-multiple-definition.s diff --git a/lld/test/wasm/Inputs/allow-multiple-definition.s b/lld/test/wasm/Inputs/allow-multiple-definition.s new file mode 100644 index 0000000000000..7a5577cb12791 --- /dev/null +++ b/lld/test/wasm/Inputs/allow-multiple-definition.s @@ -0,0 +1,6 @@ + .hidden foo + .globl foo +foo: + .functype foo () -> (i32) + i32.const 1 + end_function diff --git a/lld/test/wasm/allow-multiple-definition.s b/lld/test/wasm/allow-multiple-definition.s new file mode 100644 index 0000000000000..93fccd3c46f90 --- /dev/null +++ b/lld/test/wasm/allow-multiple-definition.s @@ -0,0 +1,38 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t1 +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/allow-multiple-definition.s -o %t2 +# RUN: not wasm-ld %t1 %t2 -o /dev/null +# RUN: not wasm-ld --allow-multiple-definition --no-allow-multiple-definition %t1 %t2 -o /dev/null +# RUN: wasm-ld --allow-multiple-definition --fatal-warnings %t1 %t2 -o %t3 +# RUN: wasm-ld --allow-multiple-definition --fatal-warnings %t2 %t1 -o %t4 +# RUN: llvm-objdump --no-print-imm-hex -d %t3 | FileCheck %s +# RUN: llvm-objdump --no-print-imm-hex -d %t4 | FileCheck --check-prefix=REVERT %s + +# RUN: wasm-ld --noinhibit-exec %t2 %t1 -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN +# WARN: warning: duplicate symbol: foo + +# RUN: wasm-ld -z muldefs --fatal-warnings %t1 %t2 -o %t3 +# RUN: wasm-ld -z muldefs --fatal-warnings %t2 %t1 -o %t4 +# RUN: llvm-objdump --no-print-imm-hex -d %t3 | FileCheck %s +# RUN: llvm-objdump --no-print-imm-hex -d %t4 | FileCheck --check-prefix=REVERT %s + +# CHECK: i32.const 0 +# REVERT: i32.const 1 + +# inputs contain different constants for function foo return. +# Tests below checks that order of files in command line +# affects on what symbol will be used. +# If flag allow-multiple-definition is enabled the first +# meet symbol should be used. + + .hidden foo + .globl foo +foo: + .functype foo () -> (i32) + i32.const 0 + end_function + + .globl _start +_start: + .functype _start () -> (i32) + call foo + end_function diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index 915c53c437172..05a547ff9278a 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -12,6 +12,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" +#include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Wasm.h" #include "llvm/Support/CachePruning.h" #include @@ -43,6 +44,7 @@ enum class BuildIdKind { None, Fast, Sha1, Hexstring, Uuid }; // and such fields have the same name as the corresponding options. // Most fields are initialized by the driver. 
struct Configuration { + bool allowMultipleDefinition; bool bsymbolic; bool checkFeatures; bool compressRelocations; @@ -64,6 +66,7 @@ struct Configuration { bool importUndefined; std::optional is64; bool mergeDataSegments; + bool noinhibitExec; bool pie; bool printGcSections; bool relocatable; @@ -148,6 +151,8 @@ struct Ctx { extern Ctx ctx; +void errorOrWarn(const llvm::Twine &msg); + } // namespace lld::wasm #endif diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 5368fe79b7eb8..cb8fe2534f5fe 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -47,6 +47,13 @@ namespace lld::wasm { Configuration *config; Ctx ctx; +void errorOrWarn(const llvm::Twine &msg) { + if (config->noinhibitExec) + warn(msg); + else + error(msg); +} + void Ctx::reset() { objectFiles.clear(); stubFiles.clear(); @@ -99,6 +106,16 @@ class LinkerDriver { std::vector files; }; + +static bool hasZOption(opt::InputArgList &args, StringRef key) { + bool ret = false; + for (const auto *arg : args.filtered(OPT_z)) + if (key == arg->getValue()) { + ret = true; + arg->claim(); + } + return ret; +} } // anonymous namespace bool link(ArrayRef args, llvm::raw_ostream &stdoutOS, @@ -467,6 +484,10 @@ getBuildId(opt::InputArgList &args) { // Initializes Config members by the command line options. static void readConfigs(opt::InputArgList &args) { + config->allowMultipleDefinition = + hasZOption(args, "muldefs") || + args.hasFlag(OPT_allow_multiple_definition, + OPT_no_allow_multiple_definition, false); config->bsymbolic = args.hasArg(OPT_Bsymbolic); config->checkFeatures = args.hasFlag(OPT_check_features, OPT_no_check_features, true); @@ -479,6 +500,7 @@ static void readConfigs(opt::InputArgList &args) { config->exportAll = args.hasArg(OPT_export_all); config->exportTable = args.hasArg(OPT_export_table); config->growableTable = args.hasArg(OPT_growable_table); + config->noinhibitExec = args.hasArg(OPT_noinhibit_exec); if (args.hasArg(OPT_import_memory_with_name)) { config->memoryImport = @@ -1173,7 +1195,7 @@ static void splitSections() { static bool isKnownZFlag(StringRef s) { // For now, we only support a very limited set of -z flags - return s.starts_with("stack-size="); + return s.starts_with("stack-size=") || s.starts_with("muldefs"); } // Report a warning for an unknown -z option. 
diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 3a70ee65f7c4f..c5febd145a54f 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -42,6 +42,10 @@ def Bdynamic: F<"Bdynamic">, HelpText<"Link against shared libraries">; def Bstatic: F<"Bstatic">, HelpText<"Do not link against shared libraries (default)">; +defm allow_multiple_definition: B<"allow-multiple-definition", + "Allow multiple definitions", + "Do not allow multiple definitions (default)">; + def build_id: F<"build-id">, HelpText<"Alias for --build-id=fast">; def build_id_eq: J<"build-id=">, HelpText<"Generate build ID note">, @@ -105,6 +109,9 @@ defm mllvm: Eq<"mllvm", "Additional arguments to forward to LLVM's option proces defm Map: Eq<"Map", "Print a link map to the specified file">; +def noinhibit_exec: F<"noinhibit-exec">, + HelpText<"Retain the executable output file whenever it is still usable">; + def o: JoinedOrSeparate<["-"], "o">, MetaVarName<"">, HelpText<"Path to file to write output">; diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp index a5d37a5eba6d5..d2216ff5a39a0 100644 --- a/lld/wasm/SymbolTable.cpp +++ b/lld/wasm/SymbolTable.cpp @@ -319,9 +319,12 @@ static bool shouldReplace(const Symbol *existing, InputFile *newFile, } // Neither symbol is week. They conflict. - error("duplicate symbol: " + toString(*existing) + "\n>>> defined in " + - toString(existing->getFile()) + "\n>>> defined in " + - toString(newFile)); + if (config->allowMultipleDefinition) + return false; + + errorOrWarn("duplicate symbol: " + toString(*existing) + "\n>>> defined in " + + toString(existing->getFile()) + "\n>>> defined in " + + toString(newFile)); return true; } From 697bc748f97736b294dd85b8f78530d023557b72 Mon Sep 17 00:00:00 2001 From: Renaud Kauffmann Date: Wed, 4 Sep 2024 08:59:55 -0700 Subject: [PATCH 117/425] Allow disabling of types from the command line (#107126) Adding hidden options to disable types through the `TargetCharacteristics`. I am seeing issues when I do this programmatically and would like, for anyone, to have the ability to reproduce them for development and testing purposes. I am planning to file a couple of issues following this patch. 
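For anyone trying to reproduce such issues, the flags are parsed by the frontend's target-option handling, so (assuming direct invocation of the frontend; the compiler driver may still need to forward them) an invocation along the lines of

    flang-new -fc1 -emit-hlfir -fdisable-real-10 -fdisable-integer-16 test.f90

exercises them. Note the options are marked HelpHidden, so they do not show up in the --help listing.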
--- clang/include/clang/Driver/Options.td | 8 ++++++++ flang/include/flang/Frontend/TargetOptions.h | 6 ++++++ flang/include/flang/Tools/TargetSetup.h | 8 ++++++++ flang/lib/Frontend/CompilerInvocation.cpp | 17 ++++++++++++++--- flang/tools/bbc/bbc.cpp | 5 +++-- 5 files changed, 39 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 8fe9f4f28f8fc..1142416e227fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6762,6 +6762,14 @@ def fdefault_integer_8 : Flag<["-"],"fdefault-integer-8">, Group, HelpText<"Set the default integer and logical kind to an 8 byte wide type">; def fdefault_real_8 : Flag<["-"],"fdefault-real-8">, Group, HelpText<"Set the default real kind to an 8 byte wide type">; +def fdisable_real_3 : Flag<["-"],"fdisable-real-3">, Group, + HelpText<"Disable real(KIND=3) from TargetCharacteristics">, Flags<[HelpHidden]>; +def fdisable_real_10 : Flag<["-"],"fdisable-real-10">, Group, + HelpText<"Disable real(KIND=10) from TargetCharacteristics">, Flags<[HelpHidden]>; +def fdisable_integer_2 : Flag<["-"],"fdisable-integer-2">, Group, + HelpText<"Disable integer(KIND=2) from TargetCharacteristics">, Flags<[HelpHidden]>; +def fdisable_integer_16 : Flag<["-"],"fdisable-integer-16">, Group, + HelpText<"Disable integer(KIND=16) from TargetCharacteristics">, Flags<[HelpHidden]>; def flarge_sizes : Flag<["-"],"flarge-sizes">, Group, HelpText<"Use INTEGER(KIND=8) for the result type in size-related intrinsics">; diff --git a/flang/include/flang/Frontend/TargetOptions.h b/flang/include/flang/Frontend/TargetOptions.h index fa72c77a028a1..332adcbe6b6ac 100644 --- a/flang/include/flang/Frontend/TargetOptions.h +++ b/flang/include/flang/Frontend/TargetOptions.h @@ -38,6 +38,12 @@ class TargetOptions { /// The list of target specific features to enable or disable, as written on /// the command line. 
std::vector featuresAsWritten; + + /// The real KINDs disabled for this target + std::vector disabledRealKinds; + + /// The integer KINDs disabled for this target + std::vector disabledIntegerKinds; }; } // end namespace Fortran::frontend diff --git a/flang/include/flang/Tools/TargetSetup.h b/flang/include/flang/Tools/TargetSetup.h index 238d66c9241dd..37c1e1d2ff63f 100644 --- a/flang/include/flang/Tools/TargetSetup.h +++ b/flang/include/flang/Tools/TargetSetup.h @@ -10,6 +10,7 @@ #define FORTRAN_TOOLS_TARGET_SETUP_H #include "flang/Evaluate/target.h" +#include "flang/Frontend/TargetOptions.h" #include "llvm/Target/TargetMachine.h" namespace Fortran::tools { @@ -17,6 +18,7 @@ namespace Fortran::tools { [[maybe_unused]] inline static void setUpTargetCharacteristics( Fortran::evaluate::TargetCharacteristics &targetCharacteristics, const llvm::TargetMachine &targetMachine, + const Fortran::frontend::TargetOptions &targetOptions, const std::string &compilerVersion, const std::string &compilerOptions) { const llvm::Triple &targetTriple{targetMachine.getTargetTriple()}; @@ -25,6 +27,12 @@ namespace Fortran::tools { targetCharacteristics.DisableType( Fortran::common::TypeCategory::Real, /*kind=*/10); + for (auto realKind : targetOptions.disabledRealKinds) + targetCharacteristics.DisableType(common::TypeCategory::Real, realKind); + + for (auto intKind : targetOptions.disabledIntegerKinds) + targetCharacteristics.DisableType(common::TypeCategory::Integer, intKind); + targetCharacteristics.set_compilerOptionsString(compilerOptions) .set_compilerVersionString(compilerVersion); diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 9e42fcc2e39d5..90c327546198b 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -438,8 +438,19 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { for (const llvm::opt::Arg *currentArg : args.filtered(clang::driver::options::OPT_target_feature)) opts.featuresAsWritten.emplace_back(currentArg->getValue()); -} + if (args.hasArg(clang::driver::options::OPT_fdisable_real_10)) + opts.disabledRealKinds.push_back(10); + + if (args.hasArg(clang::driver::options::OPT_fdisable_real_3)) + opts.disabledRealKinds.push_back(3); + + if (args.hasArg(clang::driver::options::OPT_fdisable_integer_2)) + opts.disabledIntegerKinds.push_back(2); + + if (args.hasArg(clang::driver::options::OPT_fdisable_integer_16)) + opts.disabledIntegerKinds.push_back(16); +} // Tweak the frontend configuration based on the frontend action static void setUpFrontendBasedOnAction(FrontendOptions &opts) { if (opts.programAction == DebugDumpParsingLog) @@ -1531,8 +1542,8 @@ CompilerInvocation::getSemanticsCtx( std::string compilerVersion = Fortran::common::getFlangFullVersion(); Fortran::tools::setUpTargetCharacteristics( - semanticsContext->targetCharacteristics(), targetMachine, compilerVersion, - allCompilerInvocOpts); + semanticsContext->targetCharacteristics(), targetMachine, getTargetOpts(), + compilerVersion, allCompilerInvocOpts); return semanticsContext; } diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index 736d68219581d..dcff4503f1657 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -18,6 +18,7 @@ #include "flang/Common/OpenMP-features.h" #include "flang/Common/Version.h" #include "flang/Common/default-kinds.h" +#include "flang/Frontend/TargetOptions.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/PFTBuilder.h" #include 
"flang/Lower/Support/Verifier.h" @@ -556,8 +557,8 @@ int main(int argc, char **argv) { std::string compilerVersion = Fortran::common::getFlangToolFullVersion("bbc"); std::string compilerOptions = ""; Fortran::tools::setUpTargetCharacteristics( - semanticsContext.targetCharacteristics(), *targetMachine, compilerVersion, - compilerOptions); + semanticsContext.targetCharacteristics(), *targetMachine, {}, + compilerVersion, compilerOptions); return mlir::failed( convertFortranSourceToMLIR(inputFilename, options, programPrefix, From 776495987272294de6aafbe73dab3e9ab445227a Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 4 Sep 2024 12:03:44 -0400 Subject: [PATCH 118/425] [gn/mac] bump HOST_LINK_VERSION to 520 With this, clang will pass -platform_version instead of -mmacosx_version_min to the linker. Recent versions of the linker complain that the flag is now spelled mmacos_version_min (without the x), and this supresses that warning. 520 is over 4 years old by now, so just changing this unconditionally seems fine. --- llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn index a3fb952b1112b..bc0631dc269ac 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn @@ -68,7 +68,7 @@ write_cmake_config("Config") { # FIXME: Hardcoding this isn't great, but assuming that the host ld version # has anything to do with the ld version where the built clang will run # isn't either. Probably want to make this a declare_args. - values += [ "HOST_LINK_VERSION=305" ] + values += [ "HOST_LINK_VERSION=520" ] } else { values += [ "HOST_LINK_VERSION=" ] } From c537dd9375156c2aa3cd1bfaee88af7c492359d5 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Wed, 4 Sep 2024 09:31:59 -0700 Subject: [PATCH 119/425] [MS] Put dllexported inline global initializers in a comdat (#107154) Follow-up to c19f4f8069722f6804086d4438a0254104242c46 to handle corner case of exported inline variables. Should fix #56485 --- clang/lib/CodeGen/CGDeclCXX.cpp | 43 +++++++++++++------ .../microsoft-abi-template-static-init.cpp | 5 +-- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 2f56355cff90e..8dcb5f6100619 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -586,31 +586,50 @@ CodeGenModule::EmitCXXGlobalVarDeclInitFunc(const VarDecl *D, PrioritizedCXXGlobalInits.size()); PrioritizedCXXGlobalInits.push_back(std::make_pair(Key, Fn)); } else if (isTemplateInstantiation(D->getTemplateSpecializationKind()) || - getContext().GetGVALinkageForVariable(D) == GVA_DiscardableODR || + !isUniqueGVALinkage(getContext().GetGVALinkageForVariable(D)) || D->hasAttr()) { + // For vague linkage globals, put the initializer into its own global_ctors + // entry with the global as a comdat key. This ensures at most one + // initializer per DSO runs during DSO dynamic initialization. + // + // For ELF platforms, this is an important code size and startup time + // optimization. For dynamic, non-hidden symbols, the weak guard variable + // remains to ensure that other DSOs do not re-initialize the global. 
+ // + // For PE-COFF platforms, there is no guard variable, and COMDAT + // associativity is the only way to ensure vague linkage globals are + // initialized exactly once. + // + // MachO is the only remaining platform with no comdats that doesn't + // benefit from this optimization. The rest are mainly modeled on ELF + // behavior. + // + // C++ requires that inline global variables are initialized in source + // order, but this requirement does not exist for templated entities. + // llvm.global_ctors does not guarantee initialization order, so in + // general, Clang does not fully conform to the ordering requirement. + // However, in practice, LLVM emits global_ctors in the provided order, and + // users typically don't rely on ordering between inline globals in + // different headers which are then transitively included in varying order. + // Clang's current behavior is a practical tradeoff, since dropping the + // comdat would lead to unacceptable impact on code size and startup time. + // + // FIXME: Find a solution to guarantee source-order initialization of + // inline variables. + // // C++ [basic.start.init]p2: // Definitions of explicitly specialized class template static data // members have ordered initialization. Other class template static data // members (i.e., implicitly or explicitly instantiated specializations) // have unordered initialization. // - // As a consequence, we can put them into their own llvm.global_ctors entry. - // - // If the global is externally visible, put the initializer into a COMDAT - // group with the global being initialized. On most platforms, this is a - // minor startup time optimization. In the MS C++ ABI, there are no guard - // variables, so this COMDAT key is required for correctness. - // - // SelectAny globals will be comdat-folded. Put the initializer into a - // COMDAT group associated with the global, so the initializers get folded - // too. - I = DelayedCXXInitPosition.find(D); // CXXGlobalInits.size() is the lex order number for the next deferred // VarDecl. Use it when the current VarDecl is non-deferred. Although this // lex order number is shared between current VarDecl and some following // VarDecls, their order of insertion into `llvm.global_ctors` is the same // as the lexing order and the following stable sort would preserve such // order. + I = DelayedCXXInitPosition.find(D); unsigned LexOrder = I == DelayedCXXInitPosition.end() ? CXXGlobalInits.size() : I->second; AddGlobalCtor(Fn, 65535, LexOrder, COMDATKey); diff --git a/clang/test/CodeGenCXX/microsoft-abi-template-static-init.cpp b/clang/test/CodeGenCXX/microsoft-abi-template-static-init.cpp index 60b48abca2f89..871551240debf 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-template-static-init.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-template-static-init.cpp @@ -49,8 +49,6 @@ struct X { static T ioo; static T init(); }; -// template specialized static data don't need in llvm.used, -// the static init routine get call from _GLOBAL__sub_I_ routines.
template <> int X::ioo = X::init(); template struct X; class a { @@ -87,5 +85,6 @@ struct S1 int foo(); inline int zoo = foo(); inline static int boo = foo(); +inline __declspec(dllexport) A exported_inline{}; -// CHECK: @llvm.used = appending global [8 x ptr] [ptr @"?x@selectany_init@@3HA", ptr @"?x1@selectany_init@@3HA", ptr @"?x@?$A@H@explicit_template_instantiation@@2HA", ptr @"?ioo@?$X_@H@@2HA", ptr @"?aoo@S1@@2UA@@A", ptr @"?zoo@@3HA", ptr @"?s@?$ExportedTemplate@H@@2US@@A", ptr @"?x@?$A@H@implicit_template_instantiation@@2HA"], section "llvm.metadata" +// CHECK: @llvm.used = appending global [10 x ptr] [ptr @"?x@selectany_init@@3HA", ptr @"?x1@selectany_init@@3HA", ptr @"?x@?$A@H@explicit_template_instantiation@@2HA", ptr @"?ioo@?$X_@H@@2HA", ptr @"?ioo@?$X@H@@2HA", ptr @"?aoo@S1@@2UA@@A", ptr @"?zoo@@3HA", ptr @"?exported_inline@@3UA@@A", ptr @"?s@?$ExportedTemplate@H@@2US@@A", ptr @"?x@?$A@H@implicit_template_instantiation@@2HA"], section "llvm.metadata" From 7e03753539baaaa7a5cc29da3c0dc4d2f6df3b58 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Wed, 4 Sep 2024 12:33:05 -0400 Subject: [PATCH 120/425] Disallow btf_type_tag in C++ mode (#107238) This was always intended to be disallowed in C++ (see the definition in Attr.td), but failed to add the correct checking code in SemaType.cpp to ensure it was rejected. Fixes #106864 --- clang/docs/ReleaseNotes.rst | 5 +++++ clang/lib/Sema/SemaType.cpp | 9 +++++++++ clang/test/Sema/attr-btf_type_tag.cpp | 11 +++++++++++ clang/test/SemaCXX/sugar-common-types.cpp | 7 ------- clang/test/SemaCXX/type-traits.cpp | 1 - 5 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 clang/test/Sema/attr-btf_type_tag.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index bcdbc5b702765..1520f7a2916aa 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -225,6 +225,11 @@ Attribute Changes in Clang more cases where the returned reference outlives the object. (#GH100567) +- Clang now correctly diagnoses the use of ``btf_type_tag`` in C++ and ignores + it; this attribute is a C-only attribute, and caused crashes with template + instantiation by accidentally allowing it in C++ in some circumstances. + (#GH106864) + Improvements to Clang's diagnostics ----------------------------------- diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 7df8f663da26a..520dce870b7b7 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -6490,6 +6490,15 @@ static void HandleBTFTypeTagAttribute(QualType &Type, const ParsedAttr &Attr, TypeProcessingState &State) { Sema &S = State.getSema(); + // This attribute is only supported in C. + // FIXME: we should implement checkCommonAttributeFeatures() in SemaAttr.cpp + // such that it handles type attributes, and then call that from + // processTypeAttrs() instead of one-off checks like this. + if (!Attr.diagnoseLangOpts(S)) { + Attr.setInvalid(); + return; + } + // Check the number of attribute arguments. if (Attr.getNumArgs() != 1) { S.Diag(Attr.getLoc(), diag::err_attribute_wrong_number_arguments) diff --git a/clang/test/Sema/attr-btf_type_tag.cpp b/clang/test/Sema/attr-btf_type_tag.cpp new file mode 100644 index 0000000000000..cef78fff79b96 --- /dev/null +++ b/clang/test/Sema/attr-btf_type_tag.cpp @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=c -x c %s + +// c-no-diagnostics + +// Ensure that we diagnose the attribute as ignored in C++ but not in C. 
+#ifdef __cplusplus +static_assert(__builtin_is_implicit_lifetime(int __attribute__((btf_type_tag("user"))) *)); // expected-warning {{'btf_type_tag' attribute ignored}} +#endif +int __attribute__((btf_type_tag("user"))) *ptr; // expected-warning {{'btf_type_tag' attribute ignored}} + diff --git a/clang/test/SemaCXX/sugar-common-types.cpp b/clang/test/SemaCXX/sugar-common-types.cpp index e1c7578a66b9c..39a762127811f 100644 --- a/clang/test/SemaCXX/sugar-common-types.cpp +++ b/clang/test/SemaCXX/sugar-common-types.cpp @@ -90,13 +90,6 @@ N t19 = 0 ? (__underlying_type(EnumsX::X)){} : (__underlying_type(EnumsY::Y)){}; N t20 = 0 ? (__underlying_type(EnumsX::X)){} : (__underlying_type(EnumsY::X)){}; // expected-error@-1 {{rvalue of type '__underlying_type(Enums::X)' (aka 'int')}} -using SBTF1 = SS1 [[clang::btf_type_tag("1")]]; -using SBTF2 = ::SS1 [[clang::btf_type_tag("1")]]; -using SBTF3 = ::SS1 [[clang::btf_type_tag("2")]]; - -N t21 = 0 ? (SBTF1){} : (SBTF3){}; // expected-error {{from 'SS1'}} -N t22 = 0 ? (SBTF1){} : (SBTF2){}; // expected-error {{from 'SS1 __attribute__((btf_type_tag("1")))' (aka 'SS1')}} - using QX = const SB1 *; using QY = const ::SB1 *; N t23 = 0 ? (QX){} : (QY){}; // expected-error {{rvalue of type 'const SB1 *' (aka 'const SS1 *')}} diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index bf069d9bc082c..b8a9db103782c 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -2052,7 +2052,6 @@ void is_implicit_lifetime(int n) { static_assert(__builtin_is_implicit_lifetime(float4)); static_assert(__builtin_is_implicit_lifetime(align_value_int)); static_assert(__builtin_is_implicit_lifetime(int[[clang::annotate_type("category2")]] *)); - static_assert(__builtin_is_implicit_lifetime(int __attribute__((btf_type_tag("user"))) *)); static_assert(__builtin_is_implicit_lifetime(EnforceReadOnlyPlacement)); static_assert(__builtin_is_implicit_lifetime(int __attribute__((noderef)) *)); static_assert(__builtin_is_implicit_lifetime(TypeVisibility)); From accf90e16410468a2fa1ad9d1320f33fcc4cdd79 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 4 Sep 2024 17:33:31 +0100 Subject: [PATCH 121/425] [X86] pull-conditional-binop-through-shift.ll - replace X32 check prefixes with X86 We tend to use X32 only for gnux32 testing --- .../pull-conditional-binop-through-shift.ll | 578 +++++++++--------- 1 file changed, 289 insertions(+), 289 deletions(-) diff --git a/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll index 46e8664c5efda..4f39b8f945413 100644 --- a/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll +++ b/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 ; shift left @@ -15,18 +15,18 @@ define i32 @and_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_signbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB0_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: 
.LBB0_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_signbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB0_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB0_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -44,18 +44,18 @@ define i32 @and_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_nosignbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB1_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: .LBB1_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_nosignbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB1_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB1_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -74,18 +74,18 @@ define i32 @or_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_signbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB2_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: orl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: .LBB2_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_signbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB2_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB2_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -103,18 +103,18 @@ define i32 @or_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_nosignbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB3_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: orl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: .LBB3_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_nosignbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB3_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB3_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -133,18 +133,18 @@ define i32 
@xor_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_signbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB4_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: .LBB4_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_signbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB4_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB4_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -162,18 +162,18 @@ define i32 @xor_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_nosignbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB5_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: .LBB5_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_nosignbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB5_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB5_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -192,18 +192,18 @@ define i32 @add_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_signbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB6_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB6_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_signbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB6_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB6_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -221,18 +221,18 @@ define i32 @add_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_nosignbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB7_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB7_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_nosignbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB7_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB7_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -253,18 +253,18 @@ define i32 @and_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_signbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB8_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB8_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_signbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB8_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB8_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -282,18 +282,18 @@ define i32 @and_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_nosignbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB9_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB9_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_nosignbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB9_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB9_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -312,18 +312,18 @@ define i32 @or_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_signbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB10_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: orl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB10_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_signbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB10_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB10_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -341,18 +341,18 @@ define i32 @or_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_nosignbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl 
{{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB11_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: orl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB11_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_nosignbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB11_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB11_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -371,18 +371,18 @@ define i32 @xor_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_signbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB12_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB12_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_signbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB12_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB12_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -400,18 +400,18 @@ define i32 @xor_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_nosignbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB13_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB13_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_nosignbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB13_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB13_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -430,18 +430,18 @@ define i32 @add_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_signbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB14_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB14_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_signbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB14_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $-65536, %eax # imm = 
0xFFFF0000 +; X86-NEXT: .LBB14_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -459,18 +459,18 @@ define i32 @add_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_nosignbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB15_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB15_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_nosignbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB15_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB15_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -491,18 +491,18 @@ define i32 @and_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_signbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB16_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB16_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_signbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB16_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB16_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -520,18 +520,18 @@ define i32 @and_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_nosignbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB17_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB17_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_nosignbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB17_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB17_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -550,18 +550,18 @@ define i32 @or_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_signbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB18_2 -; 
X32-NEXT: # %bb.1: -; X32-NEXT: orl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB18_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_signbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB18_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB18_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -579,18 +579,18 @@ define i32 @or_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_nosignbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB19_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: orl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB19_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_nosignbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB19_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB19_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -609,18 +609,18 @@ define i32 @xor_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_signbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB20_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB20_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_signbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB20_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB20_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -638,18 +638,18 @@ define i32 @xor_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_nosignbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB21_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB21_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_nosignbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB21_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB21_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 
2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -668,18 +668,18 @@ define i32 @add_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_signbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB22_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB22_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_signbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB22_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB22_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -697,18 +697,18 @@ define i32 @add_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_nosignbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB23_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB23_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_nosignbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB23_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB23_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 From b2223b4d7efa4ed003a1b3ce7439106ddc63125f Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 4 Sep 2024 09:52:42 -0700 Subject: [PATCH 122/425] [WebAssembly] Rename legacy EH mir tests (#107189) We added `-legacy` suffix to the legacy EH `ll` tests in #107166 but forgot to do the same for `mir` tests. --- .../{cfg-stackify-eh.mir => cfg-stackify-eh-legacy.mir} | 0 .../CodeGen/WebAssembly/{exception.mir => exception-legacy.mir} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename llvm/test/CodeGen/WebAssembly/{cfg-stackify-eh.mir => cfg-stackify-eh-legacy.mir} (100%) rename llvm/test/CodeGen/WebAssembly/{exception.mir => exception-legacy.mir} (100%) diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.mir b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir similarity index 100% rename from llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.mir rename to llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir diff --git a/llvm/test/CodeGen/WebAssembly/exception.mir b/llvm/test/CodeGen/WebAssembly/exception-legacy.mir similarity index 100% rename from llvm/test/CodeGen/WebAssembly/exception.mir rename to llvm/test/CodeGen/WebAssembly/exception-legacy.mir From 32bc670609fe9c938bca5b3c0e70e6b3934b4641 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 4 Sep 2024 09:53:38 -0700 Subject: [PATCH 123/425] [WebAssembly] Misc. fixes in CFGStackify (NFC) (#107182) This contains misc. small fixes in CFGStackify. 
Most of them are comment fixes and variable name changes. Two code changes are removing the cases that can never occur. Another is extracting a routine as a lambda function. I will add explanations inline in the code as Github comments. --- .../WebAssembly/WebAssemblyCFGStackify.cpp | 141 +++++++++--------- 1 file changed, 67 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index c7001ef2b33e6..6fd882f62f3f0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -63,8 +63,9 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { // over scoped regions when walking blocks. SmallVector ScopeTops; void updateScopeTops(MachineBasicBlock *Begin, MachineBasicBlock *End) { + int BeginNo = Begin->getNumber(); int EndNo = End->getNumber(); - if (!ScopeTops[EndNo] || ScopeTops[EndNo]->getNumber() > Begin->getNumber()) + if (!ScopeTops[EndNo] || ScopeTops[EndNo]->getNumber() > BeginNo) ScopeTops[EndNo] = Begin; } @@ -77,8 +78,8 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { // Exception handling related functions bool fixCallUnwindMismatches(MachineFunction &MF); bool fixCatchUnwindMismatches(MachineFunction &MF); - void addTryDelegate(MachineInstr *RangeBegin, MachineInstr *RangeEnd, - MachineBasicBlock *DelegateDest); + void addNestedTryDelegate(MachineInstr *RangeBegin, MachineInstr *RangeEnd, + MachineBasicBlock *UnwindDest); void recalculateScopeTops(MachineFunction &MF); void removeUnnecessaryInstrs(MachineFunction &MF); @@ -225,7 +226,7 @@ void WebAssemblyCFGStackify::registerScope(MachineInstr *Begin, EndToBegin[End] = Begin; } -// When 'End' is not an 'end_try' but 'delegate, EHPad is nullptr. +// When 'End' is not an 'end_try' but a 'delegate', EHPad is nullptr. void WebAssemblyCFGStackify::registerTryScope(MachineInstr *Begin, MachineInstr *End, MachineBasicBlock *EHPad) { @@ -293,7 +294,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { } } - // Decide where in Header to put the BLOCK. + // Decide where in MBB to put the BLOCK. // Instructions that should go before the BLOCK. SmallPtrSet BeforeSet; @@ -359,21 +360,20 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { TII.get(WebAssembly::BLOCK)) .addImm(int64_t(ReturnType)); - // Decide where in Header to put the END_BLOCK. + // Decide where in MBB to put the END_BLOCK. BeforeSet.clear(); AfterSet.clear(); for (auto &MI : MBB) { #ifndef NDEBUG - // END_BLOCK should precede existing LOOP and TRY markers. - if (MI.getOpcode() == WebAssembly::LOOP || - MI.getOpcode() == WebAssembly::TRY) + // END_BLOCK should precede existing LOOP markers. + if (MI.getOpcode() == WebAssembly::LOOP) AfterSet.insert(&MI); #endif // If there is a previously placed END_LOOP marker and the header of the // loop is above this block's header, the END_LOOP should be placed after - // the BLOCK, because the loop contains this block. Otherwise the END_LOOP - // should be placed before the BLOCK. The same for END_TRY. + // the END_BLOCK, because the loop contains this block. Otherwise the + // END_LOOP should be placed before the END_BLOCK. The same for END_TRY. 
if (MI.getOpcode() == WebAssembly::END_LOOP || MI.getOpcode() == WebAssembly::END_TRY) { if (EndToBegin[&MI]->getParent()->getNumber() >= Header->getNumber()) @@ -437,7 +437,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { TII.get(WebAssembly::LOOP)) .addImm(int64_t(WebAssembly::BlockType::Void)); - // Decide where in Header to put the END_LOOP. + // Decide where in MBB to put the END_LOOP. BeforeSet.clear(); AfterSet.clear(); #ifndef NDEBUG @@ -491,7 +491,6 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { WebAssemblyException *WE = WEI.getExceptionFor(&MBB); assert(WE); MachineBasicBlock *Bottom = SRI.getBottom(WE); - auto Iter = std::next(Bottom->getIterator()); if (Iter == MF.end()) { getAppendixBlock(MF); @@ -499,12 +498,9 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { } MachineBasicBlock *Cont = &*Iter; - assert(Cont != &MF.front()); - MachineBasicBlock *LayoutPred = Cont->getPrevNode(); - // If the nearest common dominator is inside a more deeply nested context, // walk out to the nearest scope which isn't more deeply nested. - for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) { + for (MachineFunction::iterator I(Bottom), E(Header); I != E; --I) { if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) { if (ScopeTop->getNumber() > Header->getNumber()) { // Skip over an intervening scope. @@ -538,7 +534,7 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { } // All previously inserted BLOCK/TRY markers should be after the TRY because - // they are all nested trys. + // they are all nested blocks/trys. if (MI.getOpcode() == WebAssembly::BLOCK || MI.getOpcode() == WebAssembly::TRY) AfterSet.insert(&MI); @@ -607,14 +603,13 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { TII.get(WebAssembly::TRY)) .addImm(int64_t(WebAssembly::BlockType::Void)); - // Decide where in Header to put the END_TRY. + // Decide where in Cont to put the END_TRY. BeforeSet.clear(); AfterSet.clear(); for (const auto &MI : *Cont) { #ifndef NDEBUG - // END_TRY should precede existing LOOP and BLOCK markers. - if (MI.getOpcode() == WebAssembly::LOOP || - MI.getOpcode() == WebAssembly::BLOCK) + // END_TRY should precede existing LOOP markers. + if (MI.getOpcode() == WebAssembly::LOOP) AfterSet.insert(&MI); // All END_TRY markers placed earlier belong to exceptions that contains @@ -643,9 +638,8 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { // Mark the end of the TRY. InsertPos = getEarliestInsertPos(Cont, BeforeSet, AfterSet); - MachineInstr *End = - BuildMI(*Cont, InsertPos, Bottom->findBranchDebugLoc(), - TII.get(WebAssembly::END_TRY)); + MachineInstr *End = BuildMI(*Cont, InsertPos, Bottom->findBranchDebugLoc(), + TII.get(WebAssembly::END_TRY)); registerTryScope(Begin, End, &MBB); // Track the farthest-spanning scope that ends at this point. We create two @@ -845,9 +839,9 @@ static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, // Wrap the given range of instruction with try-delegate. RangeBegin and // RangeEnd are inclusive. 
-void WebAssemblyCFGStackify::addTryDelegate(MachineInstr *RangeBegin, - MachineInstr *RangeEnd, - MachineBasicBlock *DelegateDest) { +void WebAssemblyCFGStackify::addNestedTryDelegate( + MachineInstr *RangeBegin, MachineInstr *RangeEnd, + MachineBasicBlock *UnwindDest) { auto *BeginBB = RangeBegin->getParent(); auto *EndBB = RangeEnd->getParent(); MachineFunction &MF = *BeginBB->getParent(); @@ -879,8 +873,8 @@ void WebAssemblyCFGStackify::addTryDelegate(MachineInstr *RangeBegin, MachineBasicBlock *DelegateBB = MF.CreateMachineBasicBlock(); // If the destination of 'delegate' is not the caller, adds the destination to // the BB's successors. - if (DelegateDest != FakeCallerBB) - DelegateBB->addSuccessor(DelegateDest); + if (UnwindDest != FakeCallerBB) + DelegateBB->addSuccessor(UnwindDest); auto SplitPos = std::next(RangeEnd->getIterator()); if (SplitPos == EndBB->end()) { @@ -962,7 +956,7 @@ void WebAssemblyCFGStackify::addTryDelegate(MachineInstr *RangeBegin, // Add 'delegate' instruction in the delegate BB created above. MachineInstr *Delegate = BuildMI(DelegateBB, RangeEnd->getDebugLoc(), TII.get(WebAssembly::DELEGATE)) - .addMBB(DelegateDest); + .addMBB(UnwindDest); registerTryScope(Try, Delegate, nullptr); } @@ -1130,7 +1124,7 @@ bool WebAssemblyCFGStackify::fixCallUnwindMismatches(MachineFunction &MF) { if (EHPadStack.back() == UnwindDest) continue; - // Include EH_LABELs in the range before and afer the invoke + // Include EH_LABELs in the range before and after the invoke MachineInstr *RangeBegin = &MI, *RangeEnd = &MI; if (RangeBegin->getIterator() != MBB.begin() && std::prev(RangeBegin->getIterator())->isEHLabel()) @@ -1231,22 +1225,24 @@ bool WebAssemblyCFGStackify::fixCallUnwindMismatches(MachineFunction &MF) { std::tie(RangeBegin, RangeEnd) = Range; auto *MBB = RangeBegin->getParent(); - // If this BB has an EH pad successor, i.e., ends with an 'invoke', now we - // are going to wrap the invoke with try-delegate, making the 'delegate' - // BB the new successor instead, so remove the EH pad succesor here. The - // BB may not have an EH pad successor if calls in this BB throw to the - // caller. - MachineBasicBlock *EHPad = nullptr; - for (auto *Succ : MBB->successors()) { - if (Succ->isEHPad()) { - EHPad = Succ; - break; + // If this BB has an EH pad successor, i.e., ends with an 'invoke', and if + // the current range contains the invoke, now we are going to wrap the + // invoke with try-delegate, making the 'delegate' BB the new successor + // instead, so remove the EH pad succesor here. The BB may not have an EH + // pad successor if calls in this BB throw to the caller. 
+ if (UnwindDest != getFakeCallerBlock(MF)) { + MachineBasicBlock *EHPad = nullptr; + for (auto *Succ : MBB->successors()) { + if (Succ->isEHPad()) { + EHPad = Succ; + break; + } } + if (EHPad) + MBB->removeSuccessor(EHPad); } - if (EHPad) - MBB->removeSuccessor(EHPad); - addTryDelegate(RangeBegin, RangeEnd, UnwindDest); + addNestedTryDelegate(RangeBegin, RangeEnd, UnwindDest); } } @@ -1354,12 +1350,10 @@ bool WebAssemblyCFGStackify::fixCatchUnwindMismatches(MachineFunction &MF) { NumCatchUnwindMismatches += EHPadToUnwindDest.size(); SmallPtrSet NewEndTryBBs; - for (auto &P : EHPadToUnwindDest) { - MachineBasicBlock *EHPad = P.first; - MachineBasicBlock *UnwindDest = P.second; + for (auto &[EHPad, UnwindDest] : EHPadToUnwindDest) { MachineInstr *Try = EHPadToTry[EHPad]; MachineInstr *EndTry = BeginToEnd[Try]; - addTryDelegate(Try, EndTry, UnwindDest); + addNestedTryDelegate(Try, EndTry, UnwindDest); NewEndTryBBs.insert(EndTry->getParent()); } @@ -1534,7 +1528,7 @@ static void appendEndToFunction(MachineFunction &MF, TII.get(WebAssembly::END_FUNCTION)); } -/// Insert LOOP/TRY/BLOCK markers at appropriate places. +/// Insert BLOCK/LOOP/TRY markers at appropriate places. void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { // We allocate one more than the number of blocks in the function to // accommodate for the possible fake block we may insert at the end. @@ -1558,9 +1552,9 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { // Fix mismatches in unwind destinations induced by linearizing the code. if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm && MF.getFunction().hasPersonalityFn()) { - bool Changed = fixCallUnwindMismatches(MF); - Changed |= fixCatchUnwindMismatches(MF); - if (Changed) + bool MismatchFixed = fixCallUnwindMismatches(MF); + MismatchFixed |= fixCatchUnwindMismatches(MF); + if (MismatchFixed) recalculateScopeTops(MF); } } @@ -1654,6 +1648,23 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { // Now rewrite references to basic blocks to be depth immediates. SmallVector Stack; SmallVector EHPadStack; + + auto RewriteOperands = [&](MachineInstr &MI) { + // Rewrite MBB operands to be depth immediates. + SmallVector Ops(MI.operands()); + while (MI.getNumOperands() > 0) + MI.removeOperand(MI.getNumOperands() - 1); + for (auto MO : Ops) { + if (MO.isMBB()) { + if (MI.getOpcode() == WebAssembly::DELEGATE) + MO = MachineOperand::CreateImm(getDelegateDepth(Stack, MO.getMBB())); + else + MO = MachineOperand::CreateImm(getBranchDepth(Stack, MO.getMBB())); + } + MI.addOperand(MF, MO); + } + }; + for (auto &MBB : reverse(MF)) { for (MachineInstr &MI : llvm::reverse(MBB)) { switch (MI.getOpcode()) { @@ -1697,23 +1708,8 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { break; default: - if (MI.isTerminator()) { - // Rewrite MBB operands to be depth immediates. 
- SmallVector Ops(MI.operands()); - while (MI.getNumOperands() > 0) - MI.removeOperand(MI.getNumOperands() - 1); - for (auto MO : Ops) { - if (MO.isMBB()) { - if (MI.getOpcode() == WebAssembly::DELEGATE) - MO = MachineOperand::CreateImm( - getDelegateDepth(Stack, MO.getMBB())); - else - MO = MachineOperand::CreateImm( - getBranchDepth(Stack, MO.getMBB())); - } - MI.addOperand(MF, MO); - } - } + if (MI.isTerminator()) + RewriteOperands(MI); if (MI.getOpcode() == WebAssembly::DELEGATE) Stack.push_back(std::make_pair(&MBB, &MI)); @@ -1767,10 +1763,7 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { // Add an end instruction at the end of the function body. const auto &TII = *MF.getSubtarget().getInstrInfo(); - if (!MF.getSubtarget() - .getTargetTriple() - .isOSBinFormatELF()) - appendEndToFunction(MF, TII); + appendEndToFunction(MF, TII); cleanupFunctionData(MF); From 26ba186bd0a22fac7d08ed566b00c03236b6b7a9 Mon Sep 17 00:00:00 2001 From: RolandF77 <55763885+RolandF77@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:55:27 -0400 Subject: [PATCH 124/425] [PowerPC] Improve pwr7 codegen for v4i8 load (#104507) There are no partial vector loads on pwr7 so current v4i8 codegen is an int load then store to vector sized temp and re-load as vector. Try to use lfiwax to load 32 bits into an FP reg and take advantage of VSX FP and vector reg sharing to move the result to the right vector position. --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 24 +++- .../build-vector-from-load-and-zeros.ll | 119 ++++++---------- .../PowerPC/canonical-merge-shuffles.ll | 53 +++---- llvm/test/CodeGen/PowerPC/load-and-splat.ll | 117 +++++++-------- llvm/test/CodeGen/PowerPC/pre-inc-disable.ll | 21 ++- .../CodeGen/PowerPC/scalar_vector_test_4.ll | 42 +++--- .../CodeGen/PowerPC/test-vector-insert.ll | 92 +++++------- .../PowerPC/v16i8_scalar_to_vector_shuffle.ll | 28 ++-- .../PowerPC/v2i64_scalar_to_vector_shuffle.ll | 44 ++---- .../PowerPC/v4i32_scalar_to_vector_shuffle.ll | 134 +++++++----------- .../PowerPC/v8i16_scalar_to_vector_shuffle.ll | 94 +++++------- 11 files changed, 303 insertions(+), 465 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8a0858e246252..f1bd14d7ee011 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -11492,13 +11492,33 @@ SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op, SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Op0 = Op.getOperand(0); + ReuseLoadInfo RLI; + if (Subtarget.hasLFIWAX() && Subtarget.hasVSX() && + Op.getValueType() == MVT::v4i32 && Op0.getOpcode() == ISD::LOAD && + Op0.getValueType() == MVT::i32 && Op0.hasOneUse() && + canReuseLoadAddress(Op0, MVT::i32, RLI, DAG, ISD::NON_EXTLOAD)) { + + MachineMemOperand *MMO = + MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, + RLI.Alignment, RLI.AAInfo, RLI.Ranges); + SDValue Ops[] = {RLI.Chain, RLI.Ptr, DAG.getValueType(Op.getValueType())}; + SDValue Bits = DAG.getMemIntrinsicNode( + PPCISD::LD_SPLAT, dl, DAG.getVTList(MVT::v4i32, MVT::Other), Ops, + MVT::i32, MMO); + spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); + return Bits.getValue(0); + } + // Create a stack slot that is 16-byte aligned. 
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, Align(16), false); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Val = Op.getOperand(0); + SDValue Val = Op0; EVT ValVT = Val.getValueType(); // P10 hardware store forwarding requires that a single store contains all // the data for the load. P10 is able to merge a pair of adjacent stores. Try diff --git a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll index 6d35a7281de6b..fba6725e2b2a3 100644 --- a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll +++ b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll @@ -27,20 +27,17 @@ define <2 x i64> @build_v2i64_extload_0(ptr nocapture noundef readonly %p) { ; PWR7-LE-LABEL: build_v2i64_extload_0: ; PWR7-LE: # %bb.0: # %entry ; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: lwz 3, 0(3) ; PWR7-LE-NEXT: stw 4, -16(1) ; PWR7-LE-NEXT: addis 4, 2, .LCPI0_0@toc@ha +; PWR7-LE-NEXT: lfiwzx 0, 0, 3 +; PWR7-LE-NEXT: addi 3, 1, -16 ; PWR7-LE-NEXT: addi 4, 4, .LCPI0_0@toc@l -; PWR7-LE-NEXT: stw 3, -32(1) -; PWR7-LE-NEXT: addi 3, 1, -32 -; PWR7-LE-NEXT: lxvd2x 0, 0, 4 -; PWR7-LE-NEXT: addi 4, 1, -16 ; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxswapd 34, 0 +; PWR7-LE-NEXT: xxspltw 35, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 35, 1 +; PWR7-LE-NEXT: xxswapd 34, 1 ; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 3, 4, 2 +; PWR7-LE-NEXT: vperm 2, 4, 3, 2 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v2i64_extload_0: @@ -337,17 +334,13 @@ entry: define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) { ; PWR7-BE-LABEL: build_v4i32_load_0: ; PWR7-BE: # %bb.0: # %entry -; PWR7-BE-NEXT: lwz 3, 0(3) -; PWR7-BE-NEXT: xxlxor 36, 36, 36 -; PWR7-BE-NEXT: sldi 3, 3, 32 -; PWR7-BE-NEXT: std 3, -32(1) -; PWR7-BE-NEXT: std 3, -24(1) +; PWR7-BE-NEXT: lfiwzx 0, 0, 3 ; PWR7-BE-NEXT: addis 3, 2, .LCPI8_0@toc@ha +; PWR7-BE-NEXT: xxlxor 36, 36, 36 ; PWR7-BE-NEXT: addi 3, 3, .LCPI8_0@toc@l -; PWR7-BE-NEXT: lxvw4x 34, 0, 3 -; PWR7-BE-NEXT: addi 3, 1, -32 ; PWR7-BE-NEXT: lxvw4x 35, 0, 3 -; PWR7-BE-NEXT: vperm 2, 3, 4, 2 +; PWR7-BE-NEXT: xxspltw 34, 0, 1 +; PWR7-BE-NEXT: vperm 2, 2, 4, 3 ; PWR7-BE-NEXT: blr ; ; PWR8-BE-LABEL: build_v4i32_load_0: @@ -365,20 +358,17 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) { ; PWR7-LE-LABEL: build_v4i32_load_0: ; PWR7-LE: # %bb.0: # %entry ; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: lwz 3, 0(3) ; PWR7-LE-NEXT: stw 4, -16(1) ; PWR7-LE-NEXT: addis 4, 2, .LCPI8_0@toc@ha +; PWR7-LE-NEXT: lfiwzx 0, 0, 3 +; PWR7-LE-NEXT: addi 3, 1, -16 ; PWR7-LE-NEXT: addi 4, 4, .LCPI8_0@toc@l -; PWR7-LE-NEXT: stw 3, -32(1) -; PWR7-LE-NEXT: addi 3, 1, -32 -; PWR7-LE-NEXT: lxvd2x 0, 0, 4 -; PWR7-LE-NEXT: addi 4, 1, -16 ; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxswapd 34, 0 +; PWR7-LE-NEXT: xxspltw 35, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 35, 1 +; PWR7-LE-NEXT: xxswapd 34, 1 ; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 3, 4, 2 +; PWR7-LE-NEXT: vperm 2, 4, 3, 2 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_0: @@ -400,17 +390,13 @@ entry: define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) { ; PWR7-BE-LABEL: build_v4i32_load_1: ; PWR7-BE: # %bb.0: # %entry -; PWR7-BE-NEXT: lwz 3, 0(3) -; PWR7-BE-NEXT: xxlxor 36, 36, 36 -; 
PWR7-BE-NEXT: sldi 3, 3, 32 -; PWR7-BE-NEXT: std 3, -16(1) -; PWR7-BE-NEXT: std 3, -8(1) +; PWR7-BE-NEXT: lfiwzx 0, 0, 3 ; PWR7-BE-NEXT: addis 3, 2, .LCPI9_0@toc@ha +; PWR7-BE-NEXT: xxlxor 36, 36, 36 ; PWR7-BE-NEXT: addi 3, 3, .LCPI9_0@toc@l -; PWR7-BE-NEXT: lxvw4x 34, 0, 3 -; PWR7-BE-NEXT: addi 3, 1, -16 ; PWR7-BE-NEXT: lxvw4x 35, 0, 3 -; PWR7-BE-NEXT: vperm 2, 4, 3, 2 +; PWR7-BE-NEXT: xxspltw 34, 0, 1 +; PWR7-BE-NEXT: vperm 2, 4, 2, 3 ; PWR7-BE-NEXT: blr ; ; PWR8-BE-LABEL: build_v4i32_load_1: @@ -427,20 +413,17 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) { ; PWR7-LE-LABEL: build_v4i32_load_1: ; PWR7-LE: # %bb.0: # %entry ; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: lwz 3, 0(3) -; PWR7-LE-NEXT: stw 4, -32(1) +; PWR7-LE-NEXT: stw 4, -16(1) ; PWR7-LE-NEXT: addis 4, 2, .LCPI9_0@toc@ha -; PWR7-LE-NEXT: addi 4, 4, .LCPI9_0@toc@l -; PWR7-LE-NEXT: stw 3, -16(1) +; PWR7-LE-NEXT: lfiwzx 0, 0, 3 ; PWR7-LE-NEXT: addi 3, 1, -16 -; PWR7-LE-NEXT: lxvd2x 0, 0, 4 -; PWR7-LE-NEXT: addi 4, 1, -32 +; PWR7-LE-NEXT: addi 4, 4, .LCPI9_0@toc@l ; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxswapd 34, 0 +; PWR7-LE-NEXT: xxspltw 35, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 35, 1 +; PWR7-LE-NEXT: xxswapd 34, 1 ; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 4, 3, 2 +; PWR7-LE-NEXT: vperm 2, 3, 4, 2 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_1: @@ -463,17 +446,13 @@ entry: define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) { ; PWR7-BE-LABEL: build_v4i32_load_2: ; PWR7-BE: # %bb.0: # %entry -; PWR7-BE-NEXT: lwz 3, 0(3) -; PWR7-BE-NEXT: xxlxor 36, 36, 36 -; PWR7-BE-NEXT: sldi 3, 3, 32 -; PWR7-BE-NEXT: std 3, -16(1) -; PWR7-BE-NEXT: std 3, -8(1) +; PWR7-BE-NEXT: lfiwzx 0, 0, 3 ; PWR7-BE-NEXT: addis 3, 2, .LCPI10_0@toc@ha +; PWR7-BE-NEXT: xxlxor 36, 36, 36 ; PWR7-BE-NEXT: addi 3, 3, .LCPI10_0@toc@l -; PWR7-BE-NEXT: lxvw4x 34, 0, 3 -; PWR7-BE-NEXT: addi 3, 1, -16 ; PWR7-BE-NEXT: lxvw4x 35, 0, 3 -; PWR7-BE-NEXT: vperm 2, 4, 3, 2 +; PWR7-BE-NEXT: xxspltw 34, 0, 1 +; PWR7-BE-NEXT: vperm 2, 4, 2, 3 ; PWR7-BE-NEXT: blr ; ; PWR8-BE-LABEL: build_v4i32_load_2: @@ -491,20 +470,17 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) { ; PWR7-LE-LABEL: build_v4i32_load_2: ; PWR7-LE: # %bb.0: # %entry ; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: lwz 3, 0(3) -; PWR7-LE-NEXT: stw 4, -32(1) +; PWR7-LE-NEXT: stw 4, -16(1) ; PWR7-LE-NEXT: addis 4, 2, .LCPI10_0@toc@ha -; PWR7-LE-NEXT: addi 4, 4, .LCPI10_0@toc@l -; PWR7-LE-NEXT: stw 3, -16(1) +; PWR7-LE-NEXT: lfiwzx 0, 0, 3 ; PWR7-LE-NEXT: addi 3, 1, -16 -; PWR7-LE-NEXT: lxvd2x 0, 0, 4 -; PWR7-LE-NEXT: addi 4, 1, -32 +; PWR7-LE-NEXT: addi 4, 4, .LCPI10_0@toc@l ; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxswapd 34, 0 +; PWR7-LE-NEXT: xxspltw 35, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 35, 1 +; PWR7-LE-NEXT: xxswapd 34, 1 ; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 4, 3, 2 +; PWR7-LE-NEXT: vperm 2, 3, 4, 2 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_2: @@ -526,17 +502,13 @@ entry: define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) { ; PWR7-BE-LABEL: build_v4i32_load_3: ; PWR7-BE: # %bb.0: # %entry -; PWR7-BE-NEXT: lwz 3, 0(3) -; PWR7-BE-NEXT: xxlxor 36, 36, 36 -; PWR7-BE-NEXT: sldi 3, 3, 32 -; PWR7-BE-NEXT: std 3, -16(1) -; PWR7-BE-NEXT: std 3, -8(1) +; PWR7-BE-NEXT: lfiwzx 0, 0, 3 ; PWR7-BE-NEXT: addis 3, 2, .LCPI11_0@toc@ha +; PWR7-BE-NEXT: xxlxor 36, 36, 36 ; PWR7-BE-NEXT: addi 3, 3, .LCPI11_0@toc@l -; 
PWR7-BE-NEXT: lxvw4x 34, 0, 3 -; PWR7-BE-NEXT: addi 3, 1, -16 ; PWR7-BE-NEXT: lxvw4x 35, 0, 3 -; PWR7-BE-NEXT: vperm 2, 4, 3, 2 +; PWR7-BE-NEXT: xxspltw 34, 0, 1 +; PWR7-BE-NEXT: vperm 2, 4, 2, 3 ; PWR7-BE-NEXT: blr ; ; PWR8-BE-LABEL: build_v4i32_load_3: @@ -553,20 +525,17 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) { ; PWR7-LE-LABEL: build_v4i32_load_3: ; PWR7-LE: # %bb.0: # %entry ; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: lwz 3, 0(3) -; PWR7-LE-NEXT: stw 4, -32(1) +; PWR7-LE-NEXT: stw 4, -16(1) ; PWR7-LE-NEXT: addis 4, 2, .LCPI11_0@toc@ha -; PWR7-LE-NEXT: addi 4, 4, .LCPI11_0@toc@l -; PWR7-LE-NEXT: stw 3, -16(1) +; PWR7-LE-NEXT: lfiwzx 0, 0, 3 ; PWR7-LE-NEXT: addi 3, 1, -16 -; PWR7-LE-NEXT: lxvd2x 0, 0, 4 -; PWR7-LE-NEXT: addi 4, 1, -32 +; PWR7-LE-NEXT: addi 4, 4, .LCPI11_0@toc@l ; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxswapd 34, 0 +; PWR7-LE-NEXT: xxspltw 35, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 35, 1 +; PWR7-LE-NEXT: xxswapd 34, 1 ; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 4, 3, 2 +; PWR7-LE-NEXT: vperm 2, 3, 4, 2 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_3: diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll index c26f98c5b0495..e1159e56e23eb 100644 --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -536,15 +536,12 @@ define dso_local <8 x i16> @testmrglb3(ptr nocapture readonly %a) local_unnamed_ ; ; P8-AIX-32-LABEL: testmrglb3: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r4, 4(r3) +; P8-AIX-32-NEXT: li r4, 4 +; P8-AIX-32-NEXT: lfiwzx f1, 0, r3 ; P8-AIX-32-NEXT: xxlxor v3, v3, v3 -; P8-AIX-32-NEXT: stw r4, -16(r1) -; P8-AIX-32-NEXT: lwz r3, 0(r3) -; P8-AIX-32-NEXT: stw r3, -32(r1) -; P8-AIX-32-NEXT: addi r3, r1, -16 -; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -32 -; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3 +; P8-AIX-32-NEXT: lfiwzx f0, r3, r4 +; P8-AIX-32-NEXT: xxspltw vs1, vs1, 1 +; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1 ; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0 ; P8-AIX-32-NEXT: vmrghb v2, v3, v2 ; P8-AIX-32-NEXT: blr @@ -852,17 +849,15 @@ define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(ptr nocapture rea ; ; P8-AIX-32-LABEL: no_RAUW_in_combine_during_legalize: ; P8-AIX-32: # %bb.0: # %entry +; P8-AIX-32-NEXT: li r5, 0 ; P8-AIX-32-NEXT: slwi r4, r4, 2 ; P8-AIX-32-NEXT: xxlxor v3, v3, v3 -; P8-AIX-32-NEXT: lwzx r3, r3, r4 -; P8-AIX-32-NEXT: li r4, 0 -; P8-AIX-32-NEXT: stw r4, -32(r1) -; P8-AIX-32-NEXT: stw r3, -16(r1) -; P8-AIX-32-NEXT: addi r3, r1, -32 -; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3 +; P8-AIX-32-NEXT: stw r5, -16(r1) +; P8-AIX-32-NEXT: lfiwzx f0, r3, r4 ; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3 -; P8-AIX-32-NEXT: xxmrghw v2, vs0, vs1 +; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1 +; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0 ; P8-AIX-32-NEXT: vmrghb v2, v2, v3 ; P8-AIX-32-NEXT: blr entry: @@ -1026,14 +1021,11 @@ define dso_local <2 x i64> @testSplat8(ptr nocapture readonly %ptr) local_unname ; ; P8-AIX-32-LABEL: testSplat8: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r4, 4(r3) -; P8-AIX-32-NEXT: stw r4, -16(r1) -; P8-AIX-32-NEXT: lwz r3, 0(r3) -; P8-AIX-32-NEXT: stw r3, -32(r1) -; P8-AIX-32-NEXT: addi r3, r1, -16 -; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -32 -; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3 +; P8-AIX-32-NEXT: li r4, 4 +; P8-AIX-32-NEXT: lfiwzx f1, 0, r3 +; 
P8-AIX-32-NEXT: lfiwzx f0, r3, r4 +; P8-AIX-32-NEXT: xxspltw vs1, vs1, 1 +; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1 ; P8-AIX-32-NEXT: xxmrghw vs0, vs1, vs0 ; P8-AIX-32-NEXT: xxmrghd v2, vs0, vs0 ; P8-AIX-32-NEXT: blr @@ -1081,17 +1073,14 @@ define <2 x i64> @testSplati64_0(ptr nocapture readonly %ptr) #0 { ; ; P8-AIX-32-LABEL: testSplati64_0: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r4, 0(r3) -; P8-AIX-32-NEXT: lwz r3, 4(r3) -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: li r4, 4 +; P8-AIX-32-NEXT: lfiwzx f0, r3, r4 +; P8-AIX-32-NEXT: xxspltw v2, vs0, 1 +; P8-AIX-32-NEXT: lfiwzx f0, 0, r3 ; P8-AIX-32-NEXT: lwz r3, L..C3(r2) # %const.0 -; P8-AIX-32-NEXT: stw r4, -32(r1) -; P8-AIX-32-NEXT: lxvw4x v2, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -32 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v3, v2 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %0 = load <1 x i64>, ptr %ptr, align 8 diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll index bc68ad2a67bf5..c9ee3a51f4172 100644 --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -208,47 +208,41 @@ define dso_local void @test4(ptr nocapture %c, ptr nocapture readonly %a) local_ ; ; P9-AIX32-LABEL: test4: ; P9-AIX32: # %bb.0: # %entry -; P9-AIX32-NEXT: lwz r5, 24(r4) -; P9-AIX32-NEXT: lwz r4, 28(r4) -; P9-AIX32-NEXT: stw r4, -16(r1) +; P9-AIX32-NEXT: li r5, 28 +; P9-AIX32-NEXT: lxvwsx vs0, r4, r5 +; P9-AIX32-NEXT: li r5, 24 +; P9-AIX32-NEXT: lxvwsx vs1, r4, r5 ; P9-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0 -; P9-AIX32-NEXT: stw r5, -32(r1) -; P9-AIX32-NEXT: lxv vs1, -16(r1) -; P9-AIX32-NEXT: lxv vs2, -32(r1) -; P9-AIX32-NEXT: lxv vs0, 0(r4) -; P9-AIX32-NEXT: xxperm vs1, vs2, vs0 -; P9-AIX32-NEXT: stxv vs1, 0(r3) +; P9-AIX32-NEXT: lxv vs2, 0(r4) +; P9-AIX32-NEXT: xxperm vs0, vs1, vs2 +; P9-AIX32-NEXT: stxv vs0, 0(r3) ; P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: test4: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: lwz r5, 24(r4) -; P8-AIX32-NEXT: lwz r4, 28(r4) -; P8-AIX32-NEXT: stw r4, -16(r1) +; P8-AIX32-NEXT: li r5, 28 +; P8-AIX32-NEXT: lfiwzx f0, r4, r5 +; P8-AIX32-NEXT: li r5, 24 +; P8-AIX32-NEXT: xxspltw v2, vs0, 1 +; P8-AIX32-NEXT: lfiwzx f0, r4, r5 ; P8-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0 -; P8-AIX32-NEXT: stw r5, -32(r1) -; P8-AIX32-NEXT: lxvw4x v2, 0, r4 -; P8-AIX32-NEXT: addi r4, r1, -16 -; P8-AIX32-NEXT: lxvw4x v3, 0, r4 -; P8-AIX32-NEXT: addi r4, r1, -32 ; P8-AIX32-NEXT: lxvw4x v4, 0, r4 -; P8-AIX32-NEXT: vperm v2, v4, v3, v2 +; P8-AIX32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX32-NEXT: stxvw4x v2, 0, r3 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: test4: ; P7-AIX32: # %bb.0: # %entry -; P7-AIX32-NEXT: lwz r5, 24(r4) -; P7-AIX32-NEXT: lwz r4, 28(r4) -; P7-AIX32-NEXT: stw r4, -16(r1) +; P7-AIX32-NEXT: li r5, 28 +; P7-AIX32-NEXT: lfiwzx f0, r4, r5 +; P7-AIX32-NEXT: li r5, 24 +; P7-AIX32-NEXT: xxspltw v2, vs0, 1 +; P7-AIX32-NEXT: lfiwzx f0, r4, r5 ; P7-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0 -; P7-AIX32-NEXT: stw r5, -32(r1) -; P7-AIX32-NEXT: lxvw4x v2, 0, r4 -; P7-AIX32-NEXT: addi r4, r1, -16 -; P7-AIX32-NEXT: lxvw4x v3, 0, r4 -; P7-AIX32-NEXT: addi r4, r1, -32 ; P7-AIX32-NEXT: lxvw4x v4, 0, r4 -; P7-AIX32-NEXT: vperm v2, v4, v3, v2 +; P7-AIX32-NEXT: xxspltw v3, vs0, 1 +; P7-AIX32-NEXT: vperm v2, v3, v2, v4 ; P7-AIX32-NEXT: stxvw4x v2, 0, r3 ; 
P7-AIX32-NEXT: blr entry: @@ -362,47 +356,41 @@ define void @test6(ptr %a, ptr %in) { ; ; P9-AIX32-LABEL: test6: ; P9-AIX32: # %bb.0: # %entry -; P9-AIX32-NEXT: lwz r4, 0(r4) ; P9-AIX32-NEXT: li r5, 0 -; P9-AIX32-NEXT: stw r5, -32(r1) -; P9-AIX32-NEXT: lxv vs1, -32(r1) -; P9-AIX32-NEXT: stw r4, -16(r1) -; P9-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0 +; P9-AIX32-NEXT: stw r5, -16(r1) +; P9-AIX32-NEXT: lwz r5, L..C2(r2) # %const.0 +; P9-AIX32-NEXT: lxvwsx vs1, 0, r4 ; P9-AIX32-NEXT: lxv vs2, -16(r1) -; P9-AIX32-NEXT: lxv vs0, 0(r4) -; P9-AIX32-NEXT: xxperm vs2, vs1, vs0 -; P9-AIX32-NEXT: stxv vs2, 0(r3) +; P9-AIX32-NEXT: lxv vs0, 0(r5) +; P9-AIX32-NEXT: xxperm vs1, vs2, vs0 +; P9-AIX32-NEXT: stxv vs1, 0(r3) ; P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: test6: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: lwz r4, 0(r4) ; P8-AIX32-NEXT: li r5, 0 -; P8-AIX32-NEXT: stw r5, -32(r1) -; P8-AIX32-NEXT: stw r4, -16(r1) +; P8-AIX32-NEXT: stw r5, -16(r1) +; P8-AIX32-NEXT: lfiwzx f0, 0, r4 ; P8-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0 -; P8-AIX32-NEXT: lxvw4x v2, 0, r4 -; P8-AIX32-NEXT: addi r4, r1, -32 ; P8-AIX32-NEXT: lxvw4x v3, 0, r4 ; P8-AIX32-NEXT: addi r4, r1, -16 ; P8-AIX32-NEXT: lxvw4x v4, 0, r4 -; P8-AIX32-NEXT: vperm v2, v3, v4, v2 +; P8-AIX32-NEXT: xxspltw v2, vs0, 1 +; P8-AIX32-NEXT: vperm v2, v4, v2, v3 ; P8-AIX32-NEXT: stxvw4x v2, 0, r3 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: test6: ; P7-AIX32: # %bb.0: # %entry -; P7-AIX32-NEXT: lwz r4, 0(r4) ; P7-AIX32-NEXT: li r5, 0 -; P7-AIX32-NEXT: stw r5, -32(r1) -; P7-AIX32-NEXT: stw r4, -16(r1) +; P7-AIX32-NEXT: stw r5, -16(r1) +; P7-AIX32-NEXT: lfiwzx f0, 0, r4 ; P7-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0 -; P7-AIX32-NEXT: lxvw4x v2, 0, r4 -; P7-AIX32-NEXT: addi r4, r1, -32 ; P7-AIX32-NEXT: lxvw4x v3, 0, r4 ; P7-AIX32-NEXT: addi r4, r1, -16 ; P7-AIX32-NEXT: lxvw4x v4, 0, r4 -; P7-AIX32-NEXT: vperm v2, v3, v4, v2 +; P7-AIX32-NEXT: xxspltw v2, vs0, 1 +; P7-AIX32-NEXT: vperm v2, v4, v2, v3 ; P7-AIX32-NEXT: stxvw4x v2, 0, r3 ; P7-AIX32-NEXT: blr entry: @@ -810,40 +798,31 @@ define <16 x i8> @unadjusted_lxvdsx(ptr %s, ptr %t) { ; ; P9-AIX32-LABEL: unadjusted_lxvdsx: ; P9-AIX32: # %bb.0: # %entry -; P9-AIX32-NEXT: lwz r4, 4(r3) -; P9-AIX32-NEXT: stw r4, -16(r1) -; P9-AIX32-NEXT: lwz r3, 0(r3) -; P9-AIX32-NEXT: lxv vs0, -16(r1) -; P9-AIX32-NEXT: stw r3, -32(r1) -; P9-AIX32-NEXT: lxv vs1, -32(r1) +; P9-AIX32-NEXT: li r4, 4 +; P9-AIX32-NEXT: lxvwsx vs1, 0, r3 +; P9-AIX32-NEXT: lxvwsx vs0, r3, r4 ; P9-AIX32-NEXT: xxmrghw vs0, vs1, vs0 ; P9-AIX32-NEXT: xxmrghd v2, vs0, vs0 ; P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: unadjusted_lxvdsx: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: lwz r4, 4(r3) -; P8-AIX32-NEXT: stw r4, -16(r1) -; P8-AIX32-NEXT: lwz r3, 0(r3) -; P8-AIX32-NEXT: stw r3, -32(r1) -; P8-AIX32-NEXT: addi r3, r1, -16 -; P8-AIX32-NEXT: lxvw4x vs0, 0, r3 -; P8-AIX32-NEXT: addi r3, r1, -32 -; P8-AIX32-NEXT: lxvw4x vs1, 0, r3 +; P8-AIX32-NEXT: li r4, 4 +; P8-AIX32-NEXT: lfiwzx f1, 0, r3 +; P8-AIX32-NEXT: lfiwzx f0, r3, r4 +; P8-AIX32-NEXT: xxspltw vs1, vs1, 1 +; P8-AIX32-NEXT: xxspltw vs0, vs0, 1 ; P8-AIX32-NEXT: xxmrghw vs0, vs1, vs0 ; P8-AIX32-NEXT: xxmrghd v2, vs0, vs0 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: unadjusted_lxvdsx: ; P7-AIX32: # %bb.0: # %entry -; P7-AIX32-NEXT: lwz r4, 4(r3) -; P7-AIX32-NEXT: stw r4, -16(r1) -; P7-AIX32-NEXT: lwz r3, 0(r3) -; P7-AIX32-NEXT: stw r3, -32(r1) -; P7-AIX32-NEXT: addi r3, r1, -16 -; P7-AIX32-NEXT: lxvw4x vs0, 0, r3 -; P7-AIX32-NEXT: addi r3, r1, -32 -; P7-AIX32-NEXT: lxvw4x vs1, 0, r3 +; P7-AIX32-NEXT: li r4, 4 
+; P7-AIX32-NEXT: lfiwzx f1, 0, r3 +; P7-AIX32-NEXT: lfiwzx f0, r3, r4 +; P7-AIX32-NEXT: xxspltw vs1, vs1, 1 +; P7-AIX32-NEXT: xxspltw vs0, vs0, 1 ; P7-AIX32-NEXT: xxmrghw vs0, vs1, vs0 ; P7-AIX32-NEXT: xxmrghd v2, vs0, vs0 ; P7-AIX32-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll index 4da36c9af5c10..4435484ae0b94 100644 --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -85,23 +85,20 @@ define void @test64(ptr nocapture readonly %pix2, i32 signext %i_pix2) { ; ; P9BE-AIX32-LABEL: test64: ; P9BE-AIX32: # %bb.0: # %entry -; P9BE-AIX32-NEXT: lwzux 4, 3, 4 +; P9BE-AIX32-NEXT: add 5, 3, 4 +; P9BE-AIX32-NEXT: lxvwsx 0, 3, 4 +; P9BE-AIX32-NEXT: li 3, 4 ; P9BE-AIX32-NEXT: xxlxor 2, 2, 2 ; P9BE-AIX32-NEXT: vspltisw 4, 8 -; P9BE-AIX32-NEXT: stw 4, -48(1) +; P9BE-AIX32-NEXT: lxvwsx 1, 5, 3 +; P9BE-AIX32-NEXT: lwz 3, L..C0(2) # %const.0 ; P9BE-AIX32-NEXT: vadduwm 4, 4, 4 -; P9BE-AIX32-NEXT: lwz 4, 4(3) -; P9BE-AIX32-NEXT: lxv 0, -48(1) -; P9BE-AIX32-NEXT: stw 4, -32(1) -; P9BE-AIX32-NEXT: lwz 4, L..C0(2) # %const.0 -; P9BE-AIX32-NEXT: lxv 1, -32(1) -; P9BE-AIX32-NEXT: lwz 3, 8(3) -; P9BE-AIX32-NEXT: stw 3, -16(1) -; P9BE-AIX32-NEXT: lwz 3, L..C1(2) # %const.1 ; P9BE-AIX32-NEXT: xxmrghw 2, 0, 1 -; P9BE-AIX32-NEXT: lxv 0, 0(4) +; P9BE-AIX32-NEXT: lxv 0, 0(3) +; P9BE-AIX32-NEXT: li 3, 8 ; P9BE-AIX32-NEXT: xxperm 2, 2, 0 -; P9BE-AIX32-NEXT: lxv 0, -16(1) +; P9BE-AIX32-NEXT: lxvwsx 0, 5, 3 +; P9BE-AIX32-NEXT: lwz 3, L..C1(2) # %const.1 ; P9BE-AIX32-NEXT: xxmrghw 3, 1, 0 ; P9BE-AIX32-NEXT: lxv 0, 0(3) ; P9BE-AIX32-NEXT: xxperm 3, 3, 0 diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll index 25e1baa28f7ef..c8e0d0d25f4f7 100644 --- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll @@ -73,13 +73,11 @@ define <4 x i32> @s2v_test1(ptr nocapture readonly %int32, <4 x i32> %vec) { ; ; P8-AIX-32-LABEL: s2v_test1: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r3, 0(r3) -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: lfiwzx f0, 0, r3 ; P8-AIX-32-NEXT: lwz r3, L..C0(r2) # %const.0 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v2, v3 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %0 = load i32, ptr %int32, align 4 @@ -142,13 +140,12 @@ define <4 x i32> @s2v_test2(ptr nocapture readonly %int32, <4 x i32> %vec) { ; ; P8-AIX-32-LABEL: s2v_test2: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r3, 4(r3) -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: addi r3, r3, 4 +; P8-AIX-32-NEXT: lfiwzx f0, 0, r3 ; P8-AIX-32-NEXT: lwz r3, L..C1(r2) # %const.0 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v2, v3 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, ptr %int32, i64 1 @@ -224,13 +221,11 @@ define <4 x i32> @s2v_test3(ptr nocapture readonly %int32, <4 x i32> %vec, i32 s ; P8-AIX-32-LABEL: s2v_test3: ; P8-AIX-32: # %bb.0: # %entry ; P8-AIX-32-NEXT: slwi r4, r4, 2 -; P8-AIX-32-NEXT: lwzx r3, r3, r4 -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: lfiwzx f0, r3, r4 ; P8-AIX-32-NEXT: lwz r3, L..C2(r2) # %const.0 -; 
P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v2, v3 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %idxprom = sext i32 %Idx to i64 @@ -295,13 +290,12 @@ define <4 x i32> @s2v_test4(ptr nocapture readonly %int32, <4 x i32> %vec) { ; ; P8-AIX-32-LABEL: s2v_test4: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r3, 4(r3) -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: addi r3, r3, 4 +; P8-AIX-32-NEXT: lfiwzx f0, 0, r3 ; P8-AIX-32-NEXT: lwz r3, L..C3(r2) # %const.0 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v2, v3 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, ptr %int32, i64 1 @@ -362,13 +356,11 @@ define <4 x i32> @s2v_test5(<4 x i32> %vec, ptr nocapture readonly %ptr1) { ; ; P8-AIX-32-LABEL: s2v_test5: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r3, 0(r3) -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: lfiwzx f0, 0, r3 ; P8-AIX-32-NEXT: lwz r3, L..C4(r2) # %const.0 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v2, v3 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %0 = load i32, ptr %ptr1, align 4 diff --git a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll index 73b4ad8a507b8..47fa6f2a5b4d2 100644 --- a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll +++ b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll @@ -25,16 +25,13 @@ define dso_local <4 x i32> @test(<4 x i32> %a, double %b) { ; CHECK-LE-P7: # %bb.0: # %entry ; CHECK-LE-P7-NEXT: xscvdpsxws f0, f1 ; CHECK-LE-P7-NEXT: addi r3, r1, -4 +; CHECK-LE-P7-NEXT: addis r4, r2, .LCPI0_0@toc@ha +; CHECK-LE-P7-NEXT: addi r4, r4, .LCPI0_0@toc@l ; CHECK-LE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-LE-P7-NEXT: lwz r3, -4(r1) -; CHECK-LE-P7-NEXT: stw r3, -32(r1) -; CHECK-LE-P7-NEXT: addis r3, r2, .LCPI0_0@toc@ha -; CHECK-LE-P7-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: addi r3, r1, -32 +; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P7-NEXT: xxswapd v3, vs0 -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: xxswapd v4, vs0 +; CHECK-LE-P7-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P7-NEXT: xxspltw v4, vs0, 1 ; CHECK-LE-P7-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P7-NEXT: blr ; @@ -59,16 +56,12 @@ define dso_local <4 x i32> @test(<4 x i32> %a, double %b) { ; CHECK-BE-P7-NEXT: xscvdpsxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: sldi r3, r3, 32 -; CHECK-BE-P7-NEXT: std r3, -32(r1) -; CHECK-BE-P7-NEXT: std r3, -24(r1) +; CHECK-BE-P7-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI0_0@toc@ha ; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 -; CHECK-BE-P7-NEXT: addi r3, r1, -32 ; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P7-NEXT: xxspltw v3, vs0, 1 +; CHECK-BE-P7-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test: @@ -96,16 +89,13 @@ define dso_local <4 x i32> @test2(<4 x i32> %a, float %b) { ; CHECK-LE-P7: # %bb.0: # %entry ; 
CHECK-LE-P7-NEXT: xscvdpsxws f0, f1 ; CHECK-LE-P7-NEXT: addi r3, r1, -4 +; CHECK-LE-P7-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; CHECK-LE-P7-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-LE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-LE-P7-NEXT: lwz r3, -4(r1) -; CHECK-LE-P7-NEXT: stw r3, -32(r1) -; CHECK-LE-P7-NEXT: addis r3, r2, .LCPI1_0@toc@ha -; CHECK-LE-P7-NEXT: addi r3, r3, .LCPI1_0@toc@l -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: addi r3, r1, -32 +; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P7-NEXT: xxswapd v3, vs0 -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: xxswapd v4, vs0 +; CHECK-LE-P7-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P7-NEXT: xxspltw v4, vs0, 1 ; CHECK-LE-P7-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P7-NEXT: blr ; @@ -130,16 +120,12 @@ define dso_local <4 x i32> @test2(<4 x i32> %a, float %b) { ; CHECK-BE-P7-NEXT: xscvdpsxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: sldi r3, r3, 32 -; CHECK-BE-P7-NEXT: std r3, -32(r1) -; CHECK-BE-P7-NEXT: std r3, -24(r1) +; CHECK-BE-P7-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI1_0@toc@ha ; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI1_0@toc@l -; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 -; CHECK-BE-P7-NEXT: addi r3, r1, -32 ; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P7-NEXT: xxspltw v3, vs0, 1 +; CHECK-BE-P7-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test2: @@ -167,16 +153,13 @@ define dso_local <4 x i32> @test3(<4 x i32> %a, double %b) { ; CHECK-LE-P7: # %bb.0: # %entry ; CHECK-LE-P7-NEXT: xscvdpuxws f0, f1 ; CHECK-LE-P7-NEXT: addi r3, r1, -4 +; CHECK-LE-P7-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-LE-P7-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-LE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-LE-P7-NEXT: lwz r3, -4(r1) -; CHECK-LE-P7-NEXT: stw r3, -32(r1) -; CHECK-LE-P7-NEXT: addis r3, r2, .LCPI2_0@toc@ha -; CHECK-LE-P7-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: addi r3, r1, -32 +; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P7-NEXT: xxswapd v3, vs0 -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: xxswapd v4, vs0 +; CHECK-LE-P7-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P7-NEXT: xxspltw v4, vs0, 1 ; CHECK-LE-P7-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P7-NEXT: blr ; @@ -201,16 +184,12 @@ define dso_local <4 x i32> @test3(<4 x i32> %a, double %b) { ; CHECK-BE-P7-NEXT: xscvdpuxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: sldi r3, r3, 32 -; CHECK-BE-P7-NEXT: std r3, -32(r1) -; CHECK-BE-P7-NEXT: std r3, -24(r1) +; CHECK-BE-P7-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI2_0@toc@ha ; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 -; CHECK-BE-P7-NEXT: addi r3, r1, -32 ; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P7-NEXT: xxspltw v3, vs0, 1 +; CHECK-BE-P7-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test3: @@ -238,16 +217,13 @@ define dso_local <4 x i32> @test4(<4 x i32> %a, float %b) { ; CHECK-LE-P7: # %bb.0: # %entry ; CHECK-LE-P7-NEXT: xscvdpuxws f0, f1 ; CHECK-LE-P7-NEXT: addi r3, r1, -4 +; CHECK-LE-P7-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-LE-P7-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-LE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-LE-P7-NEXT: lwz r3, -4(r1) -; CHECK-LE-P7-NEXT: stw r3, 
-32(r1) -; CHECK-LE-P7-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; CHECK-LE-P7-NEXT: addi r3, r3, .LCPI3_0@toc@l -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: addi r3, r1, -32 +; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P7-NEXT: xxswapd v3, vs0 -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: xxswapd v4, vs0 +; CHECK-LE-P7-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P7-NEXT: xxspltw v4, vs0, 1 ; CHECK-LE-P7-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P7-NEXT: blr ; @@ -272,16 +248,12 @@ define dso_local <4 x i32> @test4(<4 x i32> %a, float %b) { ; CHECK-BE-P7-NEXT: xscvdpuxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: sldi r3, r3, 32 -; CHECK-BE-P7-NEXT: std r3, -32(r1) -; CHECK-BE-P7-NEXT: std r3, -24(r1) +; CHECK-BE-P7-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI3_0@toc@ha ; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI3_0@toc@l -; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 -; CHECK-BE-P7-NEXT: addi r3, r1, -32 ; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P7-NEXT: xxspltw v3, vs0, 1 +; CHECK-BE-P7-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test4: diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll index 11cc8abd2c7fa..31d0960e19f4e 100644 --- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll @@ -2045,31 +2045,25 @@ define <16 x i8> @test_v4i32_v2i64(ptr nocapture noundef readonly %a, ptr nocapt ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: li r5, 4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 ; CHECK-AIX-32-P8-NEXT: lwz r3, L..C9(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, r4, r5 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw v3, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: li r3, 4 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs2, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, r4, r3 ; CHECK-AIX-32-P9-NEXT: lwz r3, L..C5(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs2, -32(r1) ; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs2, vs1 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 diff --git a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll index 8bb71e073e814..56c8c128ba9f4 100644 --- 
a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll @@ -1685,43 +1685,29 @@ define <2 x i64> @test_v2i64_v2i64(ptr nocapture noundef readonly %a, ptr nocapt ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r3) -; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -48(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -64(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -48 +; CHECK-AIX-32-P8-NEXT: li r5, 4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, r3, r5 +; CHECK-AIX-32-P8-NEXT: lfiwzx f2, r4, r5 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs3, vs3, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs2, vs2, 1 ; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -64 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs3, vs2 ; CHECK-AIX-32-P8-NEXT: xxmrghd v3, v2, vs0 ; CHECK-AIX-32-P8-NEXT: vaddudm v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r5, 4(r3) -; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -48(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: li r5, 4 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, r3, r5 ; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 -; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -64(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -64(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, r4, r5 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r4 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: xxmrghd v3, v2, vs0 ; CHECK-AIX-32-P9-NEXT: vaddudm v2, v3, v2 diff --git a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll index 4ca55d276647b..c8e7b20e4b8c3 100644 --- a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll @@ -743,25 +743,21 @@ define void @test_v8i16_v4i32(ptr %a) { ; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxmrghw 
vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -842,25 +838,21 @@ define void @test_v8i16_v2i64(ptr %a) { ; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -1030,25 +1022,21 @@ define void @test_v4i32_v8i16(ptr %a) { ; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -1125,26 +1113,18 @@ define void @test_v4i32_v2i64(ptr %a) { ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: 
stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 ; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -1212,14 +1192,11 @@ define void @test_v2i64_v2i64(ptr %a) { ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r4, 4(r3) -; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: li r4, 4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, r3, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 ; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 ; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 @@ -1229,12 +1206,9 @@ define void @test_v2i64_v2i64(ptr %a) { ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r3) -; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: li r4, 4 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, r3, r4 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 @@ -1308,26 +1282,18 @@ define void @test_v2i64_v4i32(ptr %a) { ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 ; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) 
-; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -1407,25 +1373,21 @@ define void @test_v2i64_v8i16(ptr %a) { ; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll index 201bc5be54506..e1aa531db449e 100644 --- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll @@ -654,17 +654,14 @@ define void @test_v2i64_none(ptr nocapture readonly %ptr1) { ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_none: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r4, 4(r3) +; CHECK-AIX-32-P8-NEXT: li r4, 4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 ; CHECK-AIX-32-P8-NEXT: xxlxor v4, v4, v4 -; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, r3, r4 ; CHECK-AIX-32-P8-NEXT: lwz r3, L..C6(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 ; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 @@ -672,14 +669,11 @@ define void @test_v2i64_none(ptr nocapture readonly %ptr1) { ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_none: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r3) +; CHECK-AIX-32-P9-NEXT: li r4, 4 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxlxor vs2, vs2, vs2 -; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, r3, r4 ; CHECK-AIX-32-P9-NEXT: lwz r3, L..C5(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: lxv 
vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: xxperm vs0, vs2, vs1 @@ -847,24 +841,20 @@ define <16 x i8> @test_v8i16_v4i32(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -937,24 +927,20 @@ define <16 x i8> @test_v8i16_v2i64(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1149,24 +1135,20 @@ define <16 x i8> @test_v4i32_v8i16(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv 
v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1519,24 +1501,20 @@ define <16 x i8> @test_v2i64_v8i16(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: From a724f9a7e5d46c9bf49c7b5e207f792fb5214c10 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 4 Sep 2024 10:08:05 -0700 Subject: [PATCH 125/425] [SLP][NFC]Make whole reg non-power-2 test for x86 and aarch64 along with risc-v --- .../SLPVectorizer/{RISCV => }/reduction-whole-regs-loads.ll | 3 +++ 1 file changed, 3 insertions(+) rename llvm/test/Transforms/SLPVectorizer/{RISCV => }/reduction-whole-regs-loads.ll (87%) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll similarity index 87% rename from llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll rename to llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll index 54dc33dbc0d00..c077181c35063 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s +; REQUIRES: aarch64-registered-target, x86-registered-target, riscv-registered-target define i64 @test(ptr %p) { ; CHECK-LABEL: @test( From 2092f3527ed743a8fb9e0858c839cd4b26907f2a Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 4 Sep 2024 10:20:13 -0700 Subject: [PATCH 126/425] [SLP][NFC]Remove unsupported attribute --- .../Transforms/SLPVectorizer/reduction-whole-regs-loads.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll index 
c077181c35063..281b5f99540ea 100644 --- a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-100 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -slp-threshold=-100 | FileCheck %s ; REQUIRES: aarch64-registered-target, x86-registered-target, riscv-registered-target define i64 @test(ptr %p) { From 601645c3b70e2a17d18779a3a51b8bc9ecdc9aa6 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Wed, 4 Sep 2024 16:52:49 +0000 Subject: [PATCH 127/425] [clang] Fix FIXME in dynamic initializer emission, NFCI This potentially affects platforms that support comdats other than ELF, COFF, or wasm, but that is the intention of the FIXME, and if they don't want this behavior, they probably shouldn't advertise comdat support. --- clang/lib/CodeGen/CGDeclCXX.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 8dcb5f6100619..c44f38ef02a3f 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -640,13 +640,13 @@ CodeGenModule::EmitCXXGlobalVarDeclInitFunc(const VarDecl *D, addUsedGlobal(COMDATKey); } - // If we used a COMDAT key for the global ctor, the init function can be - // discarded if the global ctor entry is discarded. - // FIXME: Do we need to restrict this to ELF and Wasm? + // If comdats are in use and supported, place the initializer function into + // the comdat group of the global. In the MS ABI, initializers are mangled + // and have their own comdat, so we don't include them in the group for + // consistency with MSVC. llvm::Comdat *C = Addr->getComdat(); - if (COMDATKey && C && - (getTarget().getTriple().isOSBinFormatELF() || - getTarget().getTriple().isOSBinFormatWasm())) { + if (COMDATKey && C && getTriple().supportsCOMDAT() && + !getTarget().getCXXABI().isMicrosoft()) { Fn->setComdat(C); } } else { From 9a2fd97d391caf1060e303f636d7113501788d2f Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Wed, 4 Sep 2024 10:41:09 -0700 Subject: [PATCH 128/425] Reapply^2 "[HWASan] remove incorrectly inferred attributes" (#106622) (#106816) This reverts commit 66927fb95abef9327b453d7213c5df7d641269be. 
Filter functions this applies to, which I initially wanted to do in a follow up to make reverts easier, but turns out without that it gets really slow Fleetbench proto: no significant movement Fleetbench hashing: no significant movement Fleetbench libc: no significant movement 2nd stage LLVM build: https://lab.llvm.org/buildbot/#/builders/55/builds/1765/steps/9/logs/stdio after this change: 80833.56user 3303.04system previous build: 78430.21user 3258.04system --- .../Instrumentation/HWAddressSanitizer.cpp | 43 +++- .../HWAddressSanitizer/RISCV/alloca.ll | 138 +++++----- .../HWAddressSanitizer/RISCV/basic.ll | 242 +++++++++--------- .../HWAddressSanitizer/alloca.ll | 138 +++++----- .../HWAddressSanitizer/basic.ll | 180 ++++++------- .../HWAddressSanitizer/mem-attr.ll | 10 +- 6 files changed, 394 insertions(+), 357 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 69e5835bee8a5..a7e7f9a570dac 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -598,6 +598,41 @@ void HWAddressSanitizer::initializeModule() { LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); TargetTriple = Triple(M.getTargetTriple()); + for (auto &F : M.functions()) { + // Remove memory attributes that are invalid with HWASan. + // HWASan checks read from shadow, which invalidates memory(argmem: *) + // Short granule checks on function arguments read from the argument memory + // (last byte of the granule), which invalidates writeonly. + // + // This is not only true for sanitized functions, because AttrInfer can + // infer those attributes on libc functions, which is not true if those + // are instrumented (Android) or intercepted. + + // The API is weird. `onlyReadsMemory` actually means "does not write", and + // `onlyWritesMemory` actually means "does not read". So we reconstruct + // "accesses memory" && "does not read" <=> "writes". + bool Changed = false; + if (!F.doesNotAccessMemory()) { + bool WritesMemory = !F.onlyReadsMemory(); + bool ReadsMemory = !F.onlyWritesMemory(); + if ((WritesMemory && !ReadsMemory) || F.onlyAccessesArgMemory()) { + F.removeFnAttr(Attribute::Memory); + Changed = true; + } + } + for (Argument &A : F.args()) { + if (A.hasAttribute(Attribute::WriteOnly)) { + Changed = true; + A.removeAttr(Attribute::WriteOnly); + } + } + if (Changed) { + // nobuiltin makes sure later passes don't restore assumptions about + // the function. + F.addFnAttr(Attribute::NoBuiltin); + } + } + // x86_64 currently has two modes: // - Intel LAM (default) // - pointer aliasing (heap only) @@ -1622,14 +1657,6 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, assert(!ShadowBase); - // Remove memory attributes that are about to become invalid. - // HWASan checks read from shadow, which invalidates memory(argmem: *) - // Short granule checks on function arguments read from the argument memory - // (last byte of the granule), which invalidates writeonly. 
- F.removeFnAttr(llvm::Attribute::Memory); - for (auto &A : F.args()) - A.removeAttr(llvm::Attribute::WriteOnly); - BasicBlock::iterator InsertPt = F.getEntryBlock().begin(); IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt); emitPrologue(EntryIRB, diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll index 23b1043c70016..5fd9dc6eede21 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll @@ -33,7 +33,7 @@ declare void @use32(ptr) ;. define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca -; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -42,33 +42,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] -; 
DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca -; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -77,30 +77,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, 
!dbg [[DBG13:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: 
call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; entry: %x = alloca i32, align 4 @@ -147,15 +147,16 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; DYNAMIC-SHADOW: [[META9]] = !{null} -; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; DYNAMIC-SHADOW: [[META10]] = !{null} +; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. 
; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -163,13 +164,14 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META9]] = !{null} -; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META10]] = !{null} +; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. 
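To make the effect of the attribute filtering in initializeModule() concrete, here is a small hypothetical example; it is not taken from this patch or its tests, and the function name and attributes are invented for illustration. The function only writes its argument memory, so it falls into the "writes and does not read" / "argmem only" case: the memory attribute and the writeonly parameter attribute are stripped, and nobuiltin is added so later passes do not restore those assumptions.

; Hypothetical input IR, assuming the module is built with HWASan enabled.
define void @set_byte(ptr writeonly %p) sanitize_hwaddress memory(argmem: write) {
entry:
  store i8 0, ptr %p, align 1
  ret void
}

; Conceptually, after HWAddressSanitizer::initializeModule() the definition
; would look roughly like:
;   define void @set_byte(ptr %p) nobuiltin sanitize_hwaddress { ... }
; The shadow check inserted later reads memory outside %p, and the short
; granule check reads the last byte of %p's granule, so keeping the original
; attributes would be unsound.
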
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll index 9cebe2e845f77..5415b08128663 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll @@ -9,8 +9,6 @@ ; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW ; RUN: opt < %s -passes=hwasan -hwasan-recover=1 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=RECOVER-ZERO-BASED-SHADOW -; CHECK: @llvm.used = appending global [1 x ptr] [ptr @hwasan.module_ctor] -; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }] target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "riscv64-unknown-linux" @@ -32,7 +30,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; CHECK-NEXT: br label [[TMP13]] @@ -68,7 +66,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP13]] @@ -88,7 +86,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -108,10 +106,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 
[[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -120,13 +118,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -145,7 +143,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -165,10 +163,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -177,13 +175,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] 
= add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -212,7 +210,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; CHECK-NEXT: br label [[TMP13]] @@ -248,7 +246,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP13]] @@ -268,7 +266,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -288,10 +286,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 
[[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -300,13 +298,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -325,7 +323,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -345,10 +343,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -357,13 +355,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -392,7 +390,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; CHECK-NEXT: br label [[TMP13]] @@ -428,7 +426,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP13]] @@ -448,7 +446,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -468,10 +466,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -480,13 +478,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -505,7 +503,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -525,10 +523,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -537,13 +535,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -572,7 +570,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; CHECK-NEXT: br label [[TMP13]] @@ -608,7 +606,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP13]] @@ -628,7 +626,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -648,10 +646,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -660,13 +658,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -685,7 +683,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -705,10 +703,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -717,13 +715,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -752,7 +750,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; CHECK-NEXT: br label [[TMP13]] @@ -788,7 +786,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP13]] @@ -808,7 +806,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -828,10 +826,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -840,13 +838,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -865,7 +863,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -885,10 +883,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -897,13 +895,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1013,7 +1011,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; CHECK-NEXT: br label [[TMP13]] @@ -1049,7 +1047,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1069,7 +1067,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1089,10 +1087,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1101,13 +1099,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1126,7 +1124,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1146,10 +1144,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1158,13 +1156,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1193,7 +1191,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; CHECK-NEXT: br label [[TMP13]] @@ -1229,7 +1227,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1249,7 +1247,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1269,10 +1267,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof 
[[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1281,13 +1279,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1306,7 +1304,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1326,10 +1324,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1338,13 +1336,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { 
; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1373,7 +1371,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; CHECK-NEXT: br label [[TMP13]] @@ -1409,7 +1407,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1429,7 +1427,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1449,10 +1447,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1461,13 +1459,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1486,7 +1484,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1506,10 +1504,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label 
[[TMP21:%.*]] @@ -1518,13 +1516,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1553,7 +1551,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; CHECK-NEXT: br label [[TMP13]] @@ -1589,7 +1587,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1609,7 +1607,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1629,10 +1627,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1641,13 +1639,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1666,7 +1664,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1686,10 +1684,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect 
"ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1698,13 +1696,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1733,7 +1731,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; CHECK-NEXT: br label [[TMP13]] @@ -1769,7 +1767,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1789,7 +1787,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1809,10 +1807,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], 
align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1821,13 +1819,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1846,7 +1844,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1866,10 +1864,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof 
[[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1878,13 +1876,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll index 4bd23ea76c159..73f56de707b21 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll @@ -34,7 +34,7 @@ declare void @use32(ptr) ;. define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -43,33 +43,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 
72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg 
[[DBG15]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -78,30 +78,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] 
= and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; entry: %x = alloca i32, align 4 @@ -166,15 +166,16 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; DYNAMIC-SHADOW: [[META9]] = !{null} -; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; 
DYNAMIC-SHADOW: [[META10]] = !{null} +; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. ; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -182,13 +183,14 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META9]] = !{null} -; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META10]] = !{null} +; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. 
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll index 4212293f42545..afbb8f5001114 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll @@ -42,7 +42,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP9]] @@ -70,10 +70,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -82,13 +82,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -115,10 +115,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -127,13 +127,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -174,7 +174,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP9]] @@ -202,10 +202,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -214,13 +214,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -247,10 +247,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -259,13 +259,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -306,7 +306,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 
[[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP9]] @@ -334,10 +334,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -346,13 +346,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -379,10 +379,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label 
[[TMP21:%.*]] @@ -391,13 +391,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -438,7 +438,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP9]] @@ -466,10 +466,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +478,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -511,10 +511,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -523,13 +523,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +570,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP9]] @@ -598,10 +598,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr 
[[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -610,13 +610,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -643,10 +643,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -655,13 +655,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: 
[[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -771,7 +771,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP9]] @@ -799,10 +799,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -811,13 +811,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -844,10 +844,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] 
= inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -856,13 +856,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -903,7 +903,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP9]] @@ -931,10 +931,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], 
label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -943,13 +943,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -976,10 +976,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -988,13 +988,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1035,7 +1035,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1063,10 +1063,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1075,13 +1075,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1108,10 +1108,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 
[[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1120,13 +1120,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1167,7 +1167,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1195,10 +1195,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1207,13 +1207,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1240,10 +1240,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1252,13 +1252,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1299,7 +1299,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 
[[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1327,10 +1327,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1339,13 +1339,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1372,10 +1372,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1384,13 +1384,13 @@ define void @test_store128(ptr %a, i128 %b) 
sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll index c0e370f20213a..724e6c5a0bdec 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll @@ -11,5 +11,13 @@ entry: ret void } -; CHECK: attributes #0 = { sanitize_hwaddress uwtable } +; CHECK: define dso_local void @test_readonly(ptr nocapture noundef readonly %p) local_unnamed_addr #0 +define dso_local void @test_readonly(ptr nocapture noundef readonly %p) local_unnamed_addr #1 { +entry: + store i32 42, ptr %p, align 4 + ret void +} + +; CHECK: attributes #0 = { nobuiltin sanitize_hwaddress uwtable } attributes #0 = { sanitize_hwaddress memory(argmem: write) uwtable } +attributes #1 = { sanitize_hwaddress memory(argmem: read) uwtable } From 4228e28293458e6ec49bd5487210719ff33c319a Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:51:52 -0700 Subject: [PATCH 129/425] [flang] Fix crash in semantics (#106158) Semantics crashes when merging a USE-associated derived type with a local generic procedure interface of the same name. (The other direction works.) 
--- flang/lib/Semantics/resolve-names.cpp | 9 +++-- flang/test/Semantics/generic09.f90 | 47 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 flang/test/Semantics/generic09.f90 diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index ec8f854f64d10..2e86e0afc9bd0 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -3131,7 +3131,7 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, combinedDerivedType = useDerivedType; } else { const Scope *localScope{localDerivedType->scope()}; - const Scope *useScope{useDerivedType->scope()}; + const Scope *useScope{useDerivedType->GetUltimate().scope()}; if (localScope && useScope && localScope->derivedTypeSpec() && useScope->derivedTypeSpec() && evaluate::AreSameDerivedType( @@ -3307,7 +3307,12 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, AddGenericUse(newUseGeneric, localName, useUltimate); newUseGeneric.AddUse(*localSymbol); if (combinedDerivedType) { - newUseGeneric.set_derivedType(*const_cast(combinedDerivedType)); + if (const auto *oldDT{newUseGeneric.derivedType()}) { + CHECK(&oldDT->GetUltimate() == &combinedDerivedType->GetUltimate()); + } else { + newUseGeneric.set_derivedType( + *const_cast(combinedDerivedType)); + } } if (combinedProcedure) { newUseGeneric.set_specific(*const_cast(combinedProcedure)); diff --git a/flang/test/Semantics/generic09.f90 b/flang/test/Semantics/generic09.f90 new file mode 100644 index 0000000000000..6159dd4b701d7 --- /dev/null +++ b/flang/test/Semantics/generic09.f90 @@ -0,0 +1,47 @@ +! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +module m1 + type foo + integer n + integer :: m = 1 + end type +end + +module m2 + use m1 + interface foo + module procedure f1 + end interface + contains + type(foo) function f1(a) + real, intent(in) :: a + f1%n = a + f1%m = 2 + end +end + +module m3 + use m2 + interface foo + module procedure f2 + end interface + contains + type(foo) function f2(a) + double precision, intent(in) :: a + f2%n = a + f2%m = 3 + end +end + +program main + use m3 + type(foo) x +!CHECK: foo(n=1_4,m=1_4) + x = foo(1) + print *, x +!CHECK: f1(2._4) + x = foo(2.) + print *, x +!CHECK: f2(3._8) + x = foo(3.d0) + print *, x +end From 6facf6981488700c1554dcce36d4ac774a91d568 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:52:20 -0700 Subject: [PATCH 130/425] [flang][runtime] Correct RANDOM_INIT seed generation (#106250) The initial seed was generated from a bitwise AND ("&") of two clock-generated values, instead of an XOR or (best) a truncated integer multiplication. Maybe I mistyped a shift-7 instead of a shift-6 or shift-8 when I wrote that line, but it was most likely just stupidity. Fixes https://github.com/llvm/llvm-project/issues/106221. 
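To make the failure mode concrete, here is a minimal, self-contained sketch (not part of the flang sources; it assumes a POSIX `clock_gettime`/`CLOCK_REALTIME`) that derives a seed both ways. With `&`, a zero bit in either the seconds or the nanoseconds field forces a zero bit in the seed, so seeds taken close together cluster and can repeat; with `^`, every nanosecond bit that changes flips the corresponding seed bit.

```cpp
// Hypothetical demo, not flang code: contrast the old '&' seed with the new '^' seed.
#include <cstdio>
#include <time.h>

int main() {
  for (int i = 0; i < 4; ++i) {
    timespec ts{};
    clock_gettime(CLOCK_REALTIME, &ts);
    unsigned long andSeed = ts.tv_sec & ts.tv_nsec; // old: biased, many bits forced to zero
    unsigned long xorSeed = ts.tv_sec ^ ts.tv_nsec; // new: preserves the tv_nsec entropy
    std::printf("and=%#lx xor=%#lx\n", andSeed, xorSeed);
  }
  return 0;
}
```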
--- flang/runtime/random.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/runtime/random.cpp b/flang/runtime/random.cpp index e0a421fd28396..69de9b8c96fb5 100644 --- a/flang/runtime/random.cpp +++ b/flang/runtime/random.cpp @@ -42,7 +42,7 @@ void RTNAME(RandomInit)(bool repeatable, bool /*image_distinct*/) { #ifdef CLOCK_REALTIME timespec ts; clock_gettime(CLOCK_REALTIME, &ts); - generator.seed(ts.tv_sec & ts.tv_nsec); + generator.seed(ts.tv_sec ^ ts.tv_nsec); #else generator.seed(time(nullptr)); #endif From 9e53e77265769f1916d8c4fd8ed8263798e8e815 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:52:51 -0700 Subject: [PATCH 131/425] [flang] Fix warnings from more recent GCCs (#106567) While experimenting with some more recent C++ features, I ran into trouble with warnings from GCC 12.3.0 and 14.2.0. These warnings looked legitimate, so I've tweaked the code to avoid them. --- flang/include/flang/Evaluate/integer.h | 6 +++--- flang/include/flang/Runtime/descriptor.h | 2 +- flang/lib/Lower/ConvertExpr.cpp | 3 ++- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 2 +- .../Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 3 ++- flang/lib/Semantics/check-omp-structure.cpp | 15 ++++++++------- flang/lib/Semantics/tools.cpp | 2 +- flang/unittests/Runtime/Reduction.cpp | 6 +++--- flang/unittests/Runtime/Transformational.cpp | 10 +++++----- 9 files changed, 26 insertions(+), 23 deletions(-) diff --git a/flang/include/flang/Evaluate/integer.h b/flang/include/flang/Evaluate/integer.h index e69876f07b63c..e420eb75e3dff 100644 --- a/flang/include/flang/Evaluate/integer.h +++ b/flang/include/flang/Evaluate/integer.h @@ -828,9 +828,9 @@ class Integer { if (Part ypart{y.LEPart(k)}) { BigPart xy{xpart}; xy *= ypart; -#if defined __GNUC__ && __GNUC__ < 8 - // && to < (2 * parts) was added to avoid GCC < 8 build failure on - // -Werror=array-bounds. This can be removed if -Werror is disable. +#if defined __GNUC__ && __GNUC__ < 8 || __GNUC__ >= 12 + // && to < (2 * parts) was added to avoid GCC build failure on + // -Werror=array-bounds. This can be removed if -Werror is disabled. for (int to{j + k}; xy != 0 && to < (2 * parts); ++to) { #else for (int to{j + k}; xy != 0; ++to) { diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h index 043f6931afad9..030d0c1031fba 100644 --- a/flang/include/flang/Runtime/descriptor.h +++ b/flang/include/flang/Runtime/descriptor.h @@ -438,7 +438,7 @@ class Descriptor { } RT_API_ATTRS inline void SetAllocIdx(int pos) { raw_.extra &= ~_CFI_ALLOCATOR_IDX_MASK; // Clear the allocator index bits. - raw_.extra |= (pos << _CFI_ALLOCATOR_IDX_SHIFT); + raw_.extra |= pos << _CFI_ALLOCATOR_IDX_SHIFT; } private: diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index 7dd317d64436b..62a7615e1af13 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -5590,7 +5590,8 @@ class ArrayExprLowering { ty = unwrapBoxEleTy(ty); mlir::Location loc = getLoc(); mlir::IndexType idxTy = builder.getIndexType(); - for (auto extent : mlir::cast(ty).getShape()) { + auto seqType = mlir::cast(ty); + for (auto extent : seqType.getShape()) { auto v = extent == fir::SequenceType::getUnknownExtent() ? 
builder.create(loc, idxTy).getResult() : builder.createIntegerConstant(loc, idxTy, extent); diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index e419b26125299..eb91969236ae0 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -1273,7 +1273,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion { } else { // Compute the value of the extra field based on allocator_idx and // addendum present using a Descriptor object. - Fortran::runtime::StaticDescriptor<0> staticDescriptor; + Fortran::runtime::StaticDescriptor staticDescriptor; Fortran::runtime::Descriptor &desc{staticDescriptor.descriptor()}; desc.raw().extra = 0; desc.SetAllocIdx(allocatorIdx); diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index 98205959020d2..536f2077e4f70 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -411,7 +411,8 @@ class DesignateOpConversion llvm::SmallVector firstElementIndices; auto indices = designate.getIndices(); int i = 0; - for (auto isTriplet : designate.getIsTripletAttr().asArrayRef()) { + auto attrs = designate.getIsTripletAttr(); + for (auto isTriplet : attrs.asArrayRef()) { // Coordinate of the first element are the index and triplets lower // bounds firstElementIndices.push_back(indices[i]); diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 50840898438c7..643b713b32e29 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -1796,13 +1796,12 @@ inline void OmpStructureChecker::ErrIfLHSAndRHSSymbolsMatch( const auto *e{GetExpr(context_, expr)}; const auto *v{GetExpr(context_, var)}; if (e && v) { - const Symbol &varSymbol = evaluate::GetSymbolVector(*v).front(); + auto vSyms{evaluate::GetSymbolVector(*v)}; + const Symbol &varSymbol = vSyms.front(); for (const Symbol &symbol : evaluate::GetSymbolVector(*e)) { if (varSymbol == symbol) { context_.Say(expr.source, - "RHS expression " - "on atomic assignment statement" - " cannot access '%s'"_err_en_US, + "RHS expression on atomic assignment statement cannot access '%s'"_err_en_US, var.GetSource().ToString()); } } @@ -1942,12 +1941,14 @@ void OmpStructureChecker::CheckAtomicUpdateStmt( "Expected scalar variable " "on the LHS of atomic update assignment " "statement"_err_en_US); - const Symbol &varSymbol = evaluate::GetSymbolVector(*v).front(); + auto vSyms{evaluate::GetSymbolVector(*v)}; + const Symbol &varSymbol = vSyms.front(); int numOfSymbolMatches{0}; - SymbolVector exprSymbols = evaluate::GetSymbolVector(*e); + SymbolVector exprSymbols{evaluate::GetSymbolVector(*e)}; for (const Symbol &symbol : exprSymbols) { - if (varSymbol == symbol) + if (varSymbol == symbol) { numOfSymbolMatches++; + } } if (isIntrinsicProcedure) { std::string varName = var.GetSource().ToString(); diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 845183f2d9b25..8d16ab7100876 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -1354,7 +1354,7 @@ ComponentIterator::const_iterator::BuildResultDesignatorName() const { std::string designator; for (const auto &node : componentPath_) { - designator += "%" + DEREF(node.component()).name().ToString(); + designator += "%"s + DEREF(node.component()).name().ToString(); } return designator; } diff --git 
a/flang/unittests/Runtime/Reduction.cpp b/flang/unittests/Runtime/Reduction.cpp index 41c8d86c35b76..25eb5fd760ead 100644 --- a/flang/unittests/Runtime/Reduction.cpp +++ b/flang/unittests/Runtime/Reduction.cpp @@ -42,7 +42,7 @@ TEST(Reductions, DimMaskProductInt4) { shape, std::vector{1, 2, 3, 4, 5, 6})}; auto mask{MakeArray( shape, std::vector{true, false, false, true, true, true})}; - StaticDescriptor<1, true> statDesc; + StaticDescriptor statDesc; Descriptor &prod{statDesc.descriptor()}; RTNAME(ProductDim)(prod, *array, 1, __FILE__, __LINE__, &*mask); EXPECT_EQ(prod.rank(), 1); @@ -152,7 +152,7 @@ TEST(Reductions, DoubleMaxMinNorm2) { // A scalar result occurs when you have a rank 1 array and dim == 1. std::vector shape1{24}; auto array1{MakeArray(shape1, rawData)}; - StaticDescriptor<1, true> statDesc0[1]; + StaticDescriptor<2, true> statDesc0[1]; Descriptor &scalarResult{statDesc0[0].descriptor()}; RTNAME(MaxlocDim) (scalarResult, *array1, /*KIND=*/2, /*DIM=*/1, __FILE__, __LINE__, @@ -655,7 +655,7 @@ TEST(Reductions, ReduceInt4) { TEST(Reductions, ReduceInt4Dim) { auto intMatrix{MakeArray( std::vector{2, 2}, std::vector{1, 2, 3, 4})}; - StaticDescriptor<1, true> statDesc; + StaticDescriptor<2, true> statDesc; Descriptor &sums{statDesc.descriptor()}; RTNAME(ReduceInteger4DimRef)(sums, *intMatrix, IAdd, __FILE__, __LINE__, 1); EXPECT_EQ(sums.rank(), 1); diff --git a/flang/unittests/Runtime/Transformational.cpp b/flang/unittests/Runtime/Transformational.cpp index 5678ea2515775..5836e70c740f9 100644 --- a/flang/unittests/Runtime/Transformational.cpp +++ b/flang/unittests/Runtime/Transformational.cpp @@ -33,7 +33,7 @@ template static void testBesselJn(BesselFuncType rtFunc, int32_t n1, int32_t n2, CppTypeFor x, const std::vector> &expected) { - StaticDescriptor<1> desc; + StaticDescriptor desc; Descriptor &result{desc.descriptor()}; unsigned len = expected.size(); @@ -60,7 +60,7 @@ static void testBesselJn(BesselFuncType rtFunc, int32_t n1, int32_t n2, template static void testBesselJnX0( BesselX0FuncType rtFunc, int32_t n1, int32_t n2) { - StaticDescriptor<1> desc; + StaticDescriptor desc; Descriptor &result{desc.descriptor()}; rtFunc(result, n1, n2, __FILE__, __LINE__); @@ -131,7 +131,7 @@ template static void testBesselYn(BesselFuncType rtFunc, int32_t n1, int32_t n2, CppTypeFor x, const std::vector> &expected) { - StaticDescriptor<1> desc; + StaticDescriptor desc; Descriptor &result{desc.descriptor()}; unsigned len = expected.size(); @@ -158,7 +158,7 @@ static void testBesselYn(BesselFuncType rtFunc, int32_t n1, int32_t n2, template static void testBesselYnX0( BesselX0FuncType rtFunc, int32_t n1, int32_t n2) { - StaticDescriptor<1> desc; + StaticDescriptor<2> desc; Descriptor &result{desc.descriptor()}; rtFunc(result, n1, n2, __FILE__, __LINE__); @@ -383,7 +383,7 @@ TEST(Transformational, Pack) { std::vector{false, true, true, false, false, true})}; mask->GetDimension(0).SetLowerBound(0); // shouldn't matter mask->GetDimension(1).SetLowerBound(2); - StaticDescriptor<1, true> statDesc; + StaticDescriptor statDesc; Descriptor &result{statDesc.descriptor()}; RTNAME(Pack)(result, *array, *mask, nullptr, __FILE__, __LINE__); From 500f6cc25cb93607e9ea13732b791297acf8f97f Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:53:22 -0700 Subject: [PATCH 132/425] [flang][runtime] Support SPACING for REAL(2 & 3) (#106575) Add runtime APIs for the intrinsic function SPACING for REAL kinds 2 & 3 in two ways: Spacing2 (& 3) for build 
environments with std::float16_t, and Spacing2By4 (& 3By4) variants (for any build environment) which compute SPACING for those types but accept and return their values as 32-bit floats. SPACING for REAL(2) is needed by HDF5. --- flang/include/flang/Runtime/cpp-type.h | 36 +++++++++++++++++++++++--- flang/include/flang/Runtime/numeric.h | 15 +++++++++++ flang/runtime/numeric-templates.h | 9 +++++-- flang/runtime/numeric.cpp | 20 ++++++++++++++ flang/unittests/Runtime/Numeric.cpp | 5 ++++ 5 files changed, 79 insertions(+), 6 deletions(-) diff --git a/flang/include/flang/Runtime/cpp-type.h b/flang/include/flang/Runtime/cpp-type.h index 5141d0691c5c6..fe21dd544cf7d 100644 --- a/flang/include/flang/Runtime/cpp-type.h +++ b/flang/include/flang/Runtime/cpp-type.h @@ -14,11 +14,20 @@ #include "flang/Common/Fortran.h" #include "flang/Common/float128.h" #include "flang/Common/uint128.h" -#include #include #include +#if __cplusplus >= 202302 +#include +#endif #include +#if !defined HAS_FP16 && __STDCPP_FLOAT16_T__ +#define HAS_FP16 1 +#endif +#if !defined HAS_BF16 && __STDCPP_BFLOAT16_T__ +#define HAS_BF16 1 +#endif + namespace Fortran::runtime { using common::TypeCategory; @@ -37,24 +46,43 @@ template struct CppTypeForHelper { using type = common::HostSignedIntType<8 * KIND>; }; -// TODO: REAL/COMPLEX(2 & 3) +#if HAS_FP16 +template <> struct CppTypeForHelper { + using type = std::float16_t; +}; +#endif +#if HAS_BF16 +template <> struct CppTypeForHelper { + using type = std::bfloat16_t; +}; +#endif template <> struct CppTypeForHelper { +#if __STDCPP_FLOAT32_T__ + using type = std::float32_t; +#else using type = float; +#endif }; template <> struct CppTypeForHelper { +#if __STDCPP_FLOAT64_T__ + using type = std::float64_t; +#else using type = double; +#endif }; #if LDBL_MANT_DIG == 64 template <> struct CppTypeForHelper { using type = long double; }; #endif -#if LDBL_MANT_DIG == 113 +#if __STDCPP_FLOAT128_T__ +using CppFloat128Type = std::float128_t; +#elif LDBL_MANT_DIG == 113 using CppFloat128Type = long double; #elif HAS_FLOAT128 using CppFloat128Type = __float128; #endif -#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +#if __STDCPP_FLOAT128_t || LDBL_MANT_DIG == 113 || HAS_FLOAT128 template <> struct CppTypeForHelper { using type = CppFloat128Type; }; diff --git a/flang/include/flang/Runtime/numeric.h b/flang/include/flang/Runtime/numeric.h index 6e1979790e3c6..84a5a7cd7a361 100644 --- a/flang/include/flang/Runtime/numeric.h +++ b/flang/include/flang/Runtime/numeric.h @@ -391,6 +391,21 @@ CppTypeFor RTDECL(SelectedRealKindMasked)( const char *, int, void *, int, void *, int, void *, int, int); // SPACING +// The variants Spacing2By4 and Spacing3By4 compute SPACING for REAL(2/3) +// but accept and return REAL(4) values, for use in environments where +// std::float16_t or std::bfloat16_t are unavailable. 
+#if HAS_FP16 +CppTypeFor RTDECL(Spacing2)( + CppTypeFor); +#endif +CppTypeFor RTDECL(Spacing2By4)( + CppTypeFor); +#if HAS_BF16 +CppTypeFor RTDECL(Spacing3)( + CppTypeFor); +#endif +CppTypeFor RTDECL(Spacing3By4)( + CppTypeFor); CppTypeFor RTDECL(Spacing4)( CppTypeFor); CppTypeFor RTDECL(Spacing8)( diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h index 1b5395df94519..1b43498a6bfd1 100644 --- a/flang/runtime/numeric-templates.h +++ b/flang/runtime/numeric-templates.h @@ -343,10 +343,15 @@ template inline RT_API_ATTRS T Spacing(T x) { return x; // NaN -> same NaN } else if (ISINFTy::compute(x)) { return QNANTy::compute(); // +/-Inf -> NaN - } else if (x == 0) { + } else if (x == 0) { // 0 -> TINY(x) // The standard-mandated behavior seems broken, since TINY() can't be // subnormal. - return MINTy::compute(); // 0 -> TINY(x) + if constexpr (PREC == 11) { // REAL(2) + return 0.00006103515625E-04; // TINY(0._2) + } else { + // N.B. TINY(0._3) == TINY(0._4) so this works even if no std::bfloat16_t. + return MINTy::compute(); + } } else { T result{LDEXPTy::compute( static_cast(1.0), ILOGBTy::compute(x) + 1 - PREC)}; // 2**(e-p) diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp index b5e0851a16cd1..9a8ddc6615564 100644 --- a/flang/runtime/numeric.cpp +++ b/flang/runtime/numeric.cpp @@ -848,6 +848,26 @@ CppTypeFor RTDEF(SelectedRealKindMasked)( return SelectedRealKind(p, r, d, mask); } +#if HAS_FP16 +CppTypeFor RTDEF(Spacing2)( + CppTypeFor x) { + return Spacing<11>(x); +} +#endif +CppTypeFor RTDEF(Spacing2By4)( + CppTypeFor x) { + return Spacing<11>(x); +} +#if HAS_BF16 +CppTypeFor RTDEF(Spacing3)( + CppTypeFor x) { + return Spacing<8>(x); +} +#endif +CppTypeFor RTDEF(Spacing3By4)( + CppTypeFor x) { + return Spacing<8>(x); +} CppTypeFor RTDEF(Spacing4)( CppTypeFor x) { return Spacing<24>(x); diff --git a/flang/unittests/Runtime/Numeric.cpp b/flang/unittests/Runtime/Numeric.cpp index 9f77e16570783..799756aab3839 100644 --- a/flang/unittests/Runtime/Numeric.cpp +++ b/flang/unittests/Runtime/Numeric.cpp @@ -259,6 +259,11 @@ TEST(Numeric, Spacing) { std::isnan(RTNAME(Spacing4)(std::numeric_limits>::infinity()))); EXPECT_TRUE( std::isnan(RTNAME(Spacing8)(std::numeric_limits>::quiet_NaN()))); + EXPECT_EQ(RTNAME(Spacing2By4)(Real<4>{3.0}), std::ldexp(Real<4>{1.0}, -9)); + EXPECT_EQ(RTNAME(Spacing2By4)(Real<4>{0.0}), Real<4>{0.00006103515625E-04}); + EXPECT_EQ(RTNAME(Spacing3By4)(Real<4>{3.0}), std::ldexp(Real<4>{1.0}, -6)); + EXPECT_EQ( + RTNAME(Spacing3By4)(Real<4>{0.0}), std::numeric_limits>::min()); } TEST(Numeric, FPowI) { From 143f3fc40279cbdafce190c5516c9dd74fc22ae5 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:54:00 -0700 Subject: [PATCH 133/425] [flang] Accept a non-breaking space character in source (#106611) Accept non-breaking space characters (Latin-1 '\xa0', UTF-8 '\xc2' '\xa0') in source code, converting them into regular spaces in the cooked character stream when not in character literals. 
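As a rough illustration of the detection logic that the prescanner diff below introduces, here is a small standalone sketch (the helper name and the driver are illustrative only, not flang code): classify a byte position as white space and report how many bytes to consume — one for an ASCII space or a Latin-1 NBSP, two for the UTF-8 NBSP sequence — then emit an ordinary space into the cooked output. The real prescanner also leaves character literals untouched, which this toy driver does not model.

```cpp
// Standalone sketch of NBSP handling; BytesOfSpace and main() are hypothetical.
#include <cstdio>

static int BytesOfSpace(const char *p) {
  if (*p == ' ' || *p == '\xa0') { // ASCII space or Latin-1 NBSP
    return 1;
  }
  if (p[0] == '\xc2' && p[1] == '\xa0') { // UTF-8 NBSP (U+00A0)
    return 2;
  }
  return 0;
}

int main() {
  const char line[] = "x\xc2\xa0= 1."; // 'x', UTF-8 NBSP, '=', ASCII space, "1."
  for (const char *p = line; *p != '\0';) {
    if (int n = BytesOfSpace(p)) {
      std::putchar(' '); // the cooked stream sees an ordinary space
      p += n;
    } else {
      std::putchar(*p);
      ++p;
    }
  }
  std::putchar('\n'); // prints: x = 1.
}
```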
--- flang/lib/Parser/prescan.cpp | 74 +++++++++++++++--------- flang/test/Parser/non-breaking-space.f90 | 6 ++ 2 files changed, 54 insertions(+), 26 deletions(-) create mode 100644 flang/test/Parser/non-breaking-space.f90 diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 804ada7d11e02..a0cd0ff263f92 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -46,6 +46,23 @@ Prescanner::Prescanner(const Prescanner &that, bool isNestedInIncludeDirective) compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_}, compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {} +// Returns number of bytes to skip +static inline int IsSpace(const char *p) { + if (*p == ' ') { + return 1; + } else if (*p == '\xa0') { // LATIN-1 NBSP non-breaking space + return 1; + } else if (p[0] == '\xc2' && p[1] == '\xa0') { // UTF-8 NBSP + return 2; + } else { + return 0; + } +} + +static inline int IsSpaceOrTab(const char *p) { + return *p == '\t' ? 1 : IsSpace(p); +} + static inline constexpr bool IsFixedFormCommentChar(char ch) { return ch == '!' || ch == '*' || ch == 'C' || ch == 'c'; } @@ -126,8 +143,8 @@ void Prescanner::Statement() { if (inFixedForm_) { CHECK(IsFixedFormCommentChar(*at_)); } else { - while (*at_ == ' ' || *at_ == '\t') { - ++at_, ++column_; + while (int n{IsSpaceOrTab(at_)}) { + at_ += n, ++column_; } CHECK(*at_ == '!'); } @@ -159,10 +176,10 @@ void Prescanner::Statement() { ++sp, ++at_, ++column_) { EmitChar(tokens, *sp); } - if (*at_ == ' ' || *at_ == '\t') { + if (IsSpaceOrTab(at_)) { EmitChar(tokens, ' '); - while (*at_ == ' ' || *at_ == '\t') { - ++at_, ++column_; + while (int n{IsSpaceOrTab(at_)}) { + at_ += n, ++column_; } } tokens.CloseToken(); @@ -361,7 +378,7 @@ void Prescanner::LabelField(TokenSequence &token) { column_ = 7; break; } - if (*at_ != ' ' && + if (int n{IsSpace(at_)}; n == 0 && !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space EmitChar(token, *at_); ++outCol; @@ -493,7 +510,9 @@ bool Prescanner::MustSkipToEndOfLine() const { void Prescanner::NextChar() { CHECK(*at_ != '\n'); - ++at_, ++column_; + int n{IsSpace(at_)}; + at_ += n ? n : 1; + ++column_; while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') { // UTF-8 byte order mark - treat this file as UTF-8 at_ += 3; @@ -556,23 +575,23 @@ void Prescanner::SkipCComments() { } void Prescanner::SkipSpaces() { - while (*at_ == ' ' || *at_ == '\t') { + while (IsSpaceOrTab(at_)) { NextChar(); } insertASpace_ = false; } const char *Prescanner::SkipWhiteSpace(const char *p) { - while (*p == ' ' || *p == '\t') { - ++p; + while (int n{IsSpaceOrTab(p)}) { + p += n; } return p; } const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const { while (true) { - if (*p == ' ' || *p == '\t') { - ++p; + if (int n{IsSpaceOrTab(p)}) { + p += n; } else if (IsCComment(p)) { if (const char *after{SkipCComment(p)}) { p = after; @@ -613,7 +632,7 @@ bool Prescanner::NextToken(TokenSequence &tokens) { } SkipCComments(); } - if (*at_ == ' ' || *at_ == '\t') { + if (IsSpaceOrTab(at_)) { // Compress free-form white space into a single space character. const auto theSpace{at_}; char previous{at_ <= start_ ? 
' ' : at_[-1]}; @@ -976,8 +995,8 @@ bool Prescanner::IsFixedFormCommentLine(const char *start) const { } bool anyTabs{false}; while (true) { - if (*p == ' ') { - ++p; + if (int n{IsSpace(p)}) { + p += n; } else if (*p == '\t') { anyTabs = true; ++p; @@ -1089,7 +1108,8 @@ void Prescanner::FortranInclude(const char *firstQuote) { const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const { const char *p{start}; - for (; *p == ' '; ++p) { + while (int n{IsSpace(p)}) { + p += n; } if (*p == '#') { if (inFixedForm_ && p == start + 5) { @@ -1178,9 +1198,9 @@ const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { return nullptr; } } - char col6{nextLine_[5]}; - if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') { - if (nextLine_[6] != ' ' && mightNeedSpace) { + const char *col6{nextLine_ + 5}; + if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) { + if (mightNeedSpace && !IsSpace(nextLine_ + 6)) { insertASpace_ = true; } return nextLine_ + 6; @@ -1207,9 +1227,9 @@ const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { features_.IsEnabled(LanguageFeature::OldDebugLines))) && nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' && nextLine_[4] == ' ') { - char col6{nextLine_[5]}; - if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') { - if ((col6 == 'i' || col6 == 'I') && IsIncludeLine(nextLine_)) { + const char *col6{nextLine_ + 5}; + if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) { + if ((*col6 == 'i' || *col6 == 'I') && IsIncludeLine(nextLine_)) { // It's An INCLUDE line, not a continuation } else { return nextLine_ + 6; @@ -1356,7 +1376,7 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { char sentinel[5], *sp{sentinel}; int column{2}; for (; column < 6; ++column, ++p) { - if (*p == ' ' || *p == '\n' || *p == '\t') { + if (*p == '\n' || IsSpaceOrTab(p)) { break; } if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) { @@ -1366,8 +1386,10 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { *sp++ = ToLowerCaseLetter(*p); } if (column == 6) { - if (*p == ' ' || *p == '\t' || *p == '0') { + if (*p == '0') { ++p; + } else if (int n{IsSpaceOrTab(p)}) { + p += n; } else { // This is a Continuation line, not an initial directive line. return std::nullopt; @@ -1442,10 +1464,10 @@ std::optional> Prescanner::IsCompilerDirectiveSentinel(const char *p) const { char sentinel[8]; for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) { - if (*p == ' ' || *p == '\t' || *p == '&') { + if (int n{*p == '&' ? 1 : IsSpaceOrTab(p)}) { if (j > 0) { sentinel[j] = '\0'; - p = SkipWhiteSpace(p + 1); + p = SkipWhiteSpace(p + n); if (*p != '!') { if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) { return std::make_pair(sp, p); diff --git a/flang/test/Parser/non-breaking-space.f90 b/flang/test/Parser/non-breaking-space.f90 new file mode 100644 index 0000000000000..f807d4b637f63 --- /dev/null +++ b/flang/test/Parser/non-breaking-space.f90 @@ -0,0 +1,6 @@ +! RUN: %flang_fc1 -fsyntax-only %s +! This line contains the Latin-1 NBSP (non-breaking space) character '\xa0' +x= 1. +! This line contains the UTF-8 encoding of NBSP ('\xc2' '\xa0') +x= 1. 
+end From 840da2e8ba7e0f77938adfc6f6d315137542a1b8 Mon Sep 17 00:00:00 2001 From: Sterling-Augustine <56981066+Sterling-Augustine@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:54:17 +0000 Subject: [PATCH 134/425] [SandboxIR] Implement CmpInst, FCmpInst, and ICmpInst (#106301) As in the description. Not sure the macros for "WRAP_XXX" add value or not, but do save some boiler plate. Maybe there is a better way. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 175 ++++++++++++++++- .../llvm/SandboxIR/SandboxIRValues.def | 2 + llvm/include/llvm/SandboxIR/Tracker.h | 15 +- llvm/include/llvm/SandboxIR/Type.h | 2 + llvm/lib/SandboxIR/SandboxIR.cpp | 83 ++++++++ llvm/lib/SandboxIR/Tracker.cpp | 8 + llvm/unittests/SandboxIR/SandboxIRTest.cpp | 178 ++++++++++++++++++ llvm/unittests/SandboxIR/TrackerTest.cpp | 43 +++++ 8 files changed, 500 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 5c2d58c1b99dc..89e963498426d 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -54,11 +54,17 @@ // | | // | +- ZExtInst // | -// +- CallBase -----------+- CallBrInst -// | | -// +- CmpInst +- CallInst -// | | -// +- ExtractElementInst +- InvokeInst +// +- CallBase --------+- CallBrInst +// | | +// | +- CallInst +// | | +// | +- InvokeInst +// | +// +- CmpInst ---------+- ICmpInst +// | | +// | +- FCmpInst +// | +// +- ExtractElementInst // | // +- GetElementPtrInst // | @@ -158,6 +164,9 @@ class BinaryOperator; class PossiblyDisjointInst; class AtomicRMWInst; class AtomicCmpXchgInst; +class CmpInst; +class ICmpInst; +class FCmpInst; /// Iterator for the `Use` edges of a User's operands. /// \Returns the operand `Use` when dereferenced. @@ -304,6 +313,7 @@ class Value { friend class PHINode; // For getting `Val`. friend class UnreachableInst; // For getting `Val`. friend class CatchSwitchAddHandler; // For `Val`. + friend class CmpInst; // For getting `Val`. friend class ConstantArray; // For `Val`. friend class ConstantStruct; // For `Val`. @@ -1076,6 +1086,7 @@ class Instruction : public sandboxir::User { friend class CastInst; // For getTopmostLLVMInstruction(). friend class PHINode; // For getTopmostLLVMInstruction(). friend class UnreachableInst; // For getTopmostLLVMInstruction(). + friend class CmpInst; // For getTopmostLLVMInstruction(). /// \Returns the LLVM IR Instructions that this SandboxIR maps to in program /// order. 
@@ -1232,6 +1243,7 @@ template class SingleLLVMInstructionImpl : public Instruction { friend class UnaryInstruction; friend class CallBase; friend class FuncletPadInst; + friend class CmpInst; Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final { return getOperandUseDefault(OpIdx, Verify); @@ -3425,6 +3437,151 @@ class PHINode final : public SingleLLVMInstructionImpl { // uint32_t ToIdx = 0) }; +// Wraps a static function that takes a single Predicate parameter +// LLVMValType should be the type of the wrapped class +#define WRAP_STATIC_PREDICATE(FunctionName) \ + static auto FunctionName(Predicate P) { return LLVMValType::FunctionName(P); } +// Wraps a member function that takes no parameters +// LLVMValType should be the type of the wrapped class +#define WRAP_MEMBER(FunctionName) \ + auto FunctionName() const { return cast(Val)->FunctionName(); } +// Wraps both--a common idiom in the CmpInst classes +#define WRAP_BOTH(FunctionName) \ + WRAP_STATIC_PREDICATE(FunctionName) \ + WRAP_MEMBER(FunctionName) + +class CmpInst : public SingleLLVMInstructionImpl { +protected: + using LLVMValType = llvm::CmpInst; + /// Use Context::createCmpInst(). Don't call the constructor directly. + CmpInst(llvm::CmpInst *CI, Context &Ctx, ClassID Id, Opcode Opc) + : SingleLLVMInstructionImpl(Id, Opc, CI, Ctx) {} + friend Context; // for CmpInst() + static Value *createCommon(Value *Cond, Value *True, Value *False, + const Twine &Name, IRBuilder<> &Builder, + Context &Ctx); + +public: + using Predicate = llvm::CmpInst::Predicate; + + static CmpInst *create(Predicate Pred, Value *S1, Value *S2, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name = ""); + static CmpInst *createWithCopiedFlags(Predicate Pred, Value *S1, Value *S2, + const Instruction *FlagsSource, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name = ""); + void setPredicate(Predicate P); + void swapOperands(); + + WRAP_MEMBER(getPredicate); + WRAP_BOTH(isFPPredicate); + WRAP_BOTH(isIntPredicate); + WRAP_STATIC_PREDICATE(getPredicateName); + WRAP_BOTH(getInversePredicate); + WRAP_BOTH(getOrderedPredicate); + WRAP_BOTH(getUnorderedPredicate); + WRAP_BOTH(getSwappedPredicate); + WRAP_BOTH(isStrictPredicate); + WRAP_BOTH(isNonStrictPredicate); + WRAP_BOTH(getStrictPredicate); + WRAP_BOTH(getNonStrictPredicate); + WRAP_BOTH(getFlippedStrictnessPredicate); + WRAP_MEMBER(isCommutative); + WRAP_BOTH(isEquality); + WRAP_BOTH(isRelational); + WRAP_BOTH(isSigned); + WRAP_BOTH(getSignedPredicate); + WRAP_BOTH(getUnsignedPredicate); + WRAP_BOTH(getFlippedSignednessPredicate); + WRAP_BOTH(isTrueWhenEqual); + WRAP_BOTH(isFalseWhenEqual); + WRAP_BOTH(isUnsigned); + WRAP_STATIC_PREDICATE(isOrdered); + WRAP_STATIC_PREDICATE(isUnordered); + + static bool isImpliedTrueByMatchingCmp(Predicate Pred1, Predicate Pred2) { + return llvm::CmpInst::isImpliedTrueByMatchingCmp(Pred1, Pred2); + } + static bool isImpliedFalseByMatchingCmp(Predicate Pred1, Predicate Pred2) { + return llvm::CmpInst::isImpliedFalseByMatchingCmp(Pred1, Pred2); + } + + /// Method for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ICmp || + From->getSubclassID() == ClassID::FCmp; + } + + /// Create a result type for fcmp/icmp + static Type *makeCmpResultType(Type *OpndType); + +#ifndef NDEBUG + void dumpOS(raw_ostream &OS) const override; + LLVM_DUMP_METHOD void dump() const; +#endif +}; + +class ICmpInst : public CmpInst { + /// Use Context::createICmpInst(). 
Don't call the constructor directly. + ICmpInst(llvm::ICmpInst *CI, Context &Ctx) + : CmpInst(CI, Ctx, ClassID::ICmp, Opcode::ICmp) {} + friend class Context; // For constructor. + using LLVMValType = llvm::ICmpInst; + +public: + void swapOperands(); + + WRAP_BOTH(getSignedPredicate); + WRAP_BOTH(getUnsignedPredicate); + WRAP_BOTH(isEquality); + WRAP_MEMBER(isCommutative); + WRAP_MEMBER(isRelational); + WRAP_STATIC_PREDICATE(isGT); + WRAP_STATIC_PREDICATE(isLT); + WRAP_STATIC_PREDICATE(isGE); + WRAP_STATIC_PREDICATE(isLE); + + static auto predicates() { return llvm::ICmpInst::predicates(); } + static bool compare(const APInt &LHS, const APInt &RHS, + ICmpInst::Predicate Pred) { + return llvm::ICmpInst::compare(LHS, RHS, Pred); + } + + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ICmp; + } +}; + +class FCmpInst : public CmpInst { + /// Use Context::createFCmpInst(). Don't call the constructor directly. + FCmpInst(llvm::FCmpInst *CI, Context &Ctx) + : CmpInst(CI, Ctx, ClassID::FCmp, Opcode::FCmp) {} + friend class Context; // For constructor. + using LLVMValType = llvm::FCmpInst; + +public: + void swapOperands(); + + WRAP_BOTH(isEquality); + WRAP_MEMBER(isCommutative); + WRAP_MEMBER(isRelational); + + static auto predicates() { return llvm::FCmpInst::predicates(); } + static bool compare(const APFloat &LHS, const APFloat &RHS, + FCmpInst::Predicate Pred) { + return llvm::FCmpInst::compare(LHS, RHS, Pred); + } + + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::FCmp; + } +}; + +#undef WRAP_STATIC_PREDICATE +#undef WRAP_MEMBER +#undef WRAP_BOTH + /// An LLLVM Instruction that has no SandboxIR equivalent class gets mapped to /// an OpaqueInstr. class OpaqueInst : public SingleLLVMInstructionImpl { @@ -3445,6 +3602,8 @@ class Context { LLVMContext &LLVMCtx; friend class Type; // For LLVMCtx. friend class PointerType; // For LLVMCtx. + friend class CmpInst; // For LLVMCtx. TODO: cleanup when sandboxir::VectorType + // is complete friend class IntegerType; // For LLVMCtx. friend class StructType; // For LLVMCtx. Tracker IRTracker; @@ -3572,6 +3731,12 @@ class Context { friend PHINode; // For createPHINode() UnreachableInst *createUnreachableInst(llvm::UnreachableInst *UI); friend UnreachableInst; // For createUnreachableInst() + CmpInst *createCmpInst(llvm::CmpInst *I); + friend CmpInst; // For createCmpInst() + ICmpInst *createICmpInst(llvm::ICmpInst *I); + friend ICmpInst; // For createICmpInst() + FCmpInst *createFCmpInst(llvm::FCmpInst *I); + friend FCmpInst; // For createFCmpInst() public: Context(LLVMContext &LLVMCtx) diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index d2031bbdcfb54..f320f61934efa 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -113,6 +113,8 @@ DEF_INSTR(Cast, OPCODES(\ ), CastInst) DEF_INSTR(PHI, OP(PHI), PHINode) DEF_INSTR(Unreachable, OP(Unreachable), UnreachableInst) +DEF_INSTR(ICmp, OP(ICmp), FCmpInst) +DEF_INSTR(FCmp, OP(FCmp), ICmpInst) // clang-format on #ifdef DEF_VALUE diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index c8a9e99a34341..5fc43db82bd70 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -63,7 +63,7 @@ class CatchSwitchInst; class SwitchInst; class ConstantInt; class ShuffleVectorInst; - +class CmpInst; /// The base class for IR Change classes. 
class IRChangeBase { protected: @@ -130,6 +130,19 @@ class PHIAddIncoming : public IRChangeBase { #endif }; +class CmpSwapOperands : public IRChangeBase { + CmpInst *Cmp; + +public: + CmpSwapOperands(CmpInst *Cmp); + void revert(Tracker &Tracker) final; + void accept() final {} +#ifndef NDEBUG + void dump(raw_ostream &OS) const final { OS << "CmpSwapOperands"; } + LLVM_DUMP_METHOD void dump() const final; +#endif +}; + /// Tracks swapping a Use with another Use. class UseSwap : public IRChangeBase { Use ThisUse; diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 2f9b94b8d7175..61721ca836321 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -50,6 +50,8 @@ class Type { friend class ConstantArray; // For LLVMTy. friend class ConstantStruct; // For LLVMTy. friend class ConstantVector; // For LLVMTy. + friend class CmpInst; // For LLVMTy. TODO: Cleanup after sandboxir::VectorType + // is more complete. // Friend all instruction classes because `create()` functions use LLVMTy. #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index e8d081e6b17e7..c0e5837209213 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2735,6 +2735,16 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr(new PHINode(LLVMPhi, *this)); return It->second.get(); } + case llvm::Instruction::ICmp: { + auto *LLVMICmp = cast(LLVMV); + It->second = std::unique_ptr(new ICmpInst(LLVMICmp, *this)); + return It->second.get(); + } + case llvm::Instruction::FCmp: { + auto *LLVMFCmp = cast(LLVMV); + It->second = std::unique_ptr(new FCmpInst(LLVMFCmp, *this)); + return It->second.get(); + } case llvm::Instruction::Unreachable: { auto *LLVMUnreachable = cast(LLVMV); It->second = std::unique_ptr( @@ -2922,6 +2932,79 @@ PHINode *Context::createPHINode(llvm::PHINode *I) { auto NewPtr = std::unique_ptr(new PHINode(I, *this)); return cast(registerValue(std::move(NewPtr))); } +ICmpInst *Context::createICmpInst(llvm::ICmpInst *I) { + auto NewPtr = std::unique_ptr(new ICmpInst(I, *this)); + return cast(registerValue(std::move(NewPtr))); +} +FCmpInst *Context::createFCmpInst(llvm::FCmpInst *I) { + auto NewPtr = std::unique_ptr(new FCmpInst(I, *this)); + return cast(registerValue(std::move(NewPtr))); +} +CmpInst *CmpInst::create(Predicate P, Value *S1, Value *S2, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + Builder.SetInsertPoint(InsertBefore->getTopmostLLVMInstruction()); + auto *LLVMI = Builder.CreateCmp(P, S1->Val, S2->Val, Name); + if (dyn_cast(LLVMI)) + return Ctx.createICmpInst(cast(LLVMI)); + return Ctx.createFCmpInst(cast(LLVMI)); +} +CmpInst *CmpInst::createWithCopiedFlags(Predicate P, Value *S1, Value *S2, + const Instruction *F, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name) { + CmpInst *Inst = create(P, S1, S2, InsertBefore, Ctx, Name); + cast(Inst->Val)->copyIRFlags(F->Val); + return Inst; +} + +Type *CmpInst::makeCmpResultType(Type *OpndType) { + if (auto *VT = dyn_cast(OpndType)) { + // TODO: Cleanup when we have more complete support for + // sandboxir::VectorType + return OpndType->getContext().getType(llvm::VectorType::get( + llvm::Type::getInt1Ty(OpndType->getContext().LLVMCtx), + cast(VT->LLVMTy)->getElementCount())); + } + return Type::getInt1Ty(OpndType->getContext()); +} + +void 
CmpInst::setPredicate(Predicate P) { + Ctx.getTracker() + .emplaceIfTracking< + GenericSetter<&CmpInst::getPredicate, &CmpInst::setPredicate>>(this); + cast(Val)->setPredicate(P); +} + +void CmpInst::swapOperands() { + if (ICmpInst *IC = dyn_cast(this)) + IC->swapOperands(); + else + cast(this)->swapOperands(); +} + +void ICmpInst::swapOperands() { + Ctx.getTracker().emplaceIfTracking(this); + cast(Val)->swapOperands(); +} + +void FCmpInst::swapOperands() { + Ctx.getTracker().emplaceIfTracking(this); + cast(Val)->swapOperands(); +} + +#ifndef NDEBUG +void CmpInst::dumpOS(raw_ostream &OS) const { + dumpCommonPrefix(OS); + dumpCommonSuffix(OS); +} + +void CmpInst::dump() const { + dumpOS(dbgs()); + dbgs() << "\n"; +} +#endif // NDEBUG Value *Context::getValue(llvm::Value *V) const { auto It = LLVMValueToValueMap.find(V); diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index 953d4bd51353a..c6eb9fc68a4b1 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -248,6 +248,14 @@ void ShuffleVectorSetMask::dump() const { } #endif +CmpSwapOperands::CmpSwapOperands(CmpInst *Cmp) : Cmp(Cmp) {} + +void CmpSwapOperands::revert(Tracker &Tracker) { Cmp->swapOperands(); } +void CmpSwapOperands::dump() const { + dump(dbgs()); + dbgs() << "\n"; +} + void Tracker::save() { State = TrackerState::Record; } void Tracker::revert() { diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index ca2a183e53268..d1c5690ccad5b 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -4800,6 +4800,184 @@ define void @foo(i32 %arg) { EXPECT_EQ(NewPHI->getNumIncomingValues(), PHI->getNumIncomingValues()); } +static void checkSwapOperands(sandboxir::Context &Ctx, + llvm::sandboxir::CmpInst *Cmp, + llvm::CmpInst *LLVMCmp) { + auto OrigOp0 = Cmp->getOperand(0); + auto OrigOp1 = Cmp->getOperand(1); + EXPECT_EQ(Ctx.getValue(LLVMCmp->getOperand(0)), OrigOp0); + EXPECT_EQ(Ctx.getValue(LLVMCmp->getOperand(1)), OrigOp1); + // This checks the dispatch mechanism in CmpInst, as well as + // the specific implementations. 
+ Cmp->swapOperands(); + EXPECT_EQ(Ctx.getValue(LLVMCmp->getOperand(1)), OrigOp0); + EXPECT_EQ(Ctx.getValue(LLVMCmp->getOperand(0)), OrigOp1); + EXPECT_EQ(Cmp->getOperand(0), OrigOp1); + EXPECT_EQ(Cmp->getOperand(1), OrigOp0); + // Undo it to keep the rest of the test consistent + Cmp->swapOperands(); +} + +static void checkCommonPredicates(sandboxir::CmpInst *Cmp, + llvm::CmpInst *LLVMCmp) { + // Check proper creation + auto Pred = Cmp->getPredicate(); + auto LLVMPred = LLVMCmp->getPredicate(); + EXPECT_EQ(Pred, LLVMPred); + // Check setPredicate + Cmp->setPredicate(llvm::CmpInst::FCMP_FALSE); + EXPECT_EQ(Cmp->getPredicate(), llvm::CmpInst::FCMP_FALSE); + EXPECT_EQ(LLVMCmp->getPredicate(), llvm::CmpInst::FCMP_FALSE); + Cmp->setPredicate(Pred); + EXPECT_EQ(LLVMCmp->getPredicate(), Pred); + // Ensure the accessors properly forward to the underlying implementation + EXPECT_STREQ(sandboxir::CmpInst::getPredicateName(Pred).data(), + llvm::CmpInst::getPredicateName(LLVMPred).data()); + EXPECT_EQ(Cmp->isFPPredicate(), LLVMCmp->isFPPredicate()); + EXPECT_EQ(Cmp->isIntPredicate(), LLVMCmp->isIntPredicate()); + EXPECT_EQ(Cmp->getInversePredicate(), LLVMCmp->getInversePredicate()); + EXPECT_EQ(Cmp->getOrderedPredicate(), LLVMCmp->getOrderedPredicate()); + EXPECT_EQ(Cmp->getUnorderedPredicate(), LLVMCmp->getUnorderedPredicate()); + EXPECT_EQ(Cmp->getSwappedPredicate(), LLVMCmp->getSwappedPredicate()); + EXPECT_EQ(Cmp->isStrictPredicate(), LLVMCmp->isStrictPredicate()); + EXPECT_EQ(Cmp->isNonStrictPredicate(), LLVMCmp->isNonStrictPredicate()); + EXPECT_EQ(Cmp->isRelational(), LLVMCmp->isRelational()); + if (Cmp->isRelational()) { + EXPECT_EQ(Cmp->getFlippedStrictnessPredicate(), + LLVMCmp->getFlippedStrictnessPredicate()); + } + EXPECT_EQ(Cmp->isCommutative(), LLVMCmp->isCommutative()); + EXPECT_EQ(Cmp->isTrueWhenEqual(), LLVMCmp->isTrueWhenEqual()); + EXPECT_EQ(Cmp->isFalseWhenEqual(), LLVMCmp->isFalseWhenEqual()); + EXPECT_EQ(sandboxir::CmpInst::isOrdered(Pred), + llvm::CmpInst::isOrdered(LLVMPred)); + EXPECT_EQ(sandboxir::CmpInst::isUnordered(Pred), + llvm::CmpInst::isUnordered(LLVMPred)); +} + +TEST_F(SandboxIRTest, ICmpInst) { + SCOPED_TRACE("SandboxIRTest sandboxir::ICmpInst tests"); + parseIR(C, R"IR( +define void @foo(i32 %i0, i32 %i1) { + bb: + %ine = icmp ne i32 %i0, %i1 + %iugt = icmp ugt i32 %i0, %i1 + %iuge = icmp uge i32 %i0, %i1 + %iult = icmp ult i32 %i0, %i1 + %iule = icmp ule i32 %i0, %i1 + %isgt = icmp sgt i32 %i0, %i1 + %isle = icmp sle i32 %i0, %i1 + %ieg = icmp eq i32 %i0, %i1 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + + auto *LLVMBB = getBasicBlockByName(LLVMF, "bb"); + auto LLVMIt = LLVMBB->begin(); + auto *BB = cast(Ctx.getValue(LLVMBB)); + auto It = BB->begin(); + // Check classof() + while (auto *ICmp = dyn_cast(&*It++)) { + auto *LLVMICmp = cast(&*LLVMIt++); + checkSwapOperands(Ctx, ICmp, LLVMICmp); + checkCommonPredicates(ICmp, LLVMICmp); + EXPECT_EQ(ICmp->isSigned(), LLVMICmp->isSigned()); + EXPECT_EQ(ICmp->isUnsigned(), LLVMICmp->isUnsigned()); + EXPECT_EQ(ICmp->getSignedPredicate(), LLVMICmp->getSignedPredicate()); + EXPECT_EQ(ICmp->getUnsignedPredicate(), LLVMICmp->getUnsignedPredicate()); + } + auto *NewCmp = + sandboxir::CmpInst::create(llvm::CmpInst::ICMP_ULE, F.getArg(0), + F.getArg(1), &*BB->begin(), Ctx, "NewCmp"); + EXPECT_EQ(NewCmp, &*BB->begin()); + EXPECT_EQ(NewCmp->getPredicate(), llvm::CmpInst::ICMP_ULE); + EXPECT_EQ(NewCmp->getOperand(0), 
F.getArg(0)); + EXPECT_EQ(NewCmp->getOperand(1), F.getArg(1)); +#ifndef NDEBUG + EXPECT_EQ(NewCmp->getName(), "NewCmp"); +#endif // NDEBUG + // TODO: Improve this test when sandboxir::VectorType is more completely + // implemented. + sandboxir::Type *RT = + sandboxir::CmpInst::makeCmpResultType(F.getArg(0)->getType()); + EXPECT_TRUE(RT->isIntegerTy(1)); // Only one bit in a single comparison +} + +TEST_F(SandboxIRTest, FCmpInst) { + SCOPED_TRACE("SandboxIRTest sandboxir::FCmpInst tests"); + parseIR(C, R"IR( +define void @foo(float %f0, float %f1) { +bb: + %ffalse = fcmp false float %f0, %f1 + %foeq = fcmp oeq float %f0, %f1 + %fogt = fcmp ogt float %f0, %f1 + %folt = fcmp olt float %f0, %f1 + %fole = fcmp ole float %f0, %f1 + %fone = fcmp one float %f0, %f1 + %ford = fcmp ord float %f0, %f1 + %funo = fcmp uno float %f0, %f1 + %fueq = fcmp ueq float %f0, %f1 + %fugt = fcmp ugt float %f0, %f1 + %fuge = fcmp uge float %f0, %f1 + %fult = fcmp ult float %f0, %f1 + %fule = fcmp ule float %f0, %f1 + %fune = fcmp une float %f0, %f1 + %ftrue = fcmp true float %f0, %f1 + ret void +bb1: + %copyfrom = fadd reassoc float %f0, 42.0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + + auto *LLVMBB = getBasicBlockByName(LLVMF, "bb"); + auto LLVMIt = LLVMBB->begin(); + auto *BB = cast(Ctx.getValue(LLVMBB)); + auto It = BB->begin(); + // Check classof() + while (auto *FCmp = dyn_cast(&*It++)) { + auto *LLVMFCmp = cast(&*LLVMIt++); + checkSwapOperands(Ctx, FCmp, LLVMFCmp); + checkCommonPredicates(FCmp, LLVMFCmp); + } + + auto *LLVMBB1 = getBasicBlockByName(LLVMF, "bb1"); + auto *BB1 = cast(Ctx.getValue(LLVMBB1)); + auto It1 = BB1->begin(); + auto *CopyFrom = &*It1++; + CopyFrom->setFastMathFlags(FastMathFlags::getFast()); + + // create with default flags + auto *NewFCmp = sandboxir::CmpInst::create( + llvm::CmpInst::FCMP_ONE, F.getArg(0), F.getArg(1), &*It1, Ctx, "NewFCmp"); + EXPECT_EQ(NewFCmp->getPredicate(), llvm::CmpInst::FCMP_ONE); + EXPECT_EQ(NewFCmp->getOperand(0), F.getArg(0)); + EXPECT_EQ(NewFCmp->getOperand(1), F.getArg(1)); +#ifndef NDEBUG + EXPECT_EQ(NewFCmp->getName(), "NewFCmp"); +#endif // NDEBUG + FastMathFlags DefaultFMF = NewFCmp->getFastMathFlags(); + EXPECT_TRUE(CopyFrom->getFastMathFlags() != DefaultFMF); + // create with copied flags + auto *NewFCmpFlags = sandboxir::CmpInst::createWithCopiedFlags( + llvm::CmpInst::FCMP_ONE, F.getArg(0), F.getArg(1), CopyFrom, &*It1, Ctx, + "NewFCmpFlags"); + EXPECT_FALSE(NewFCmpFlags->getFastMathFlags() != + CopyFrom->getFastMathFlags()); + EXPECT_EQ(NewFCmpFlags->getPredicate(), llvm::CmpInst::FCMP_ONE); + EXPECT_EQ(NewFCmpFlags->getOperand(0), F.getArg(0)); + EXPECT_EQ(NewFCmpFlags->getOperand(1), F.getArg(1)); +#ifndef NDEBUG + EXPECT_EQ(NewFCmpFlags->getName(), "NewFCmpFlags"); +#endif // NDEBUG +} + TEST_F(SandboxIRTest, UnreachableInst) { parseIR(C, R"IR( define void @foo() { diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index fe29452a8aea2..a1f39fe958e35 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -1452,6 +1452,49 @@ define void @foo(i8 %arg0, i8 %arg1, i8 %arg2) { EXPECT_EQ(PHI->getIncomingValue(1), Arg1); } +void checkCmpInst(sandboxir::Context &Ctx, sandboxir::CmpInst *Cmp) { + Ctx.save(); + auto OrigP = Cmp->getPredicate(); + auto NewP = Cmp->getSwappedPredicate(); + Cmp->setPredicate(NewP); + EXPECT_EQ(Cmp->getPredicate(), NewP); 
+ Ctx.revert(); + EXPECT_EQ(Cmp->getPredicate(), OrigP); + + Ctx.save(); + auto OrigOp0 = Cmp->getOperand(0); + auto OrigOp1 = Cmp->getOperand(1); + Cmp->swapOperands(); + EXPECT_EQ(Cmp->getPredicate(), NewP); + EXPECT_EQ(Cmp->getOperand(0), OrigOp1); + EXPECT_EQ(Cmp->getOperand(1), OrigOp0); + Ctx.revert(); + EXPECT_EQ(Cmp->getPredicate(), OrigP); + EXPECT_EQ(Cmp->getOperand(0), OrigOp0); + EXPECT_EQ(Cmp->getOperand(1), OrigOp1); +} + +TEST_F(TrackerTest, CmpInst) { + SCOPED_TRACE("TrackerTest sandboxir::CmpInst tests"); + parseIR(C, R"IR( +define void @foo(i64 %i0, i64 %i1, float %f0, float %f1) { + %foeq = fcmp ogt float %f0, %f1 + %ioeq = icmp uge i64 %i0, %i1 + + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *FCmp = cast(&*It++); + checkCmpInst(Ctx, FCmp); + auto *ICmp = cast(&*It++); + checkCmpInst(Ctx, ICmp); +} + TEST_F(TrackerTest, SetVolatile) { parseIR(C, R"IR( define void @foo(ptr %arg0, i8 %val) { From d1e4a2d300f7c0c6b681ddf719132c81d348aaab Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:54:46 -0700 Subject: [PATCH 135/425] [flang] Fix spurious error with separate module procedures (#106768) When the implementation of one SMP apparently references another in what might be a specification expression, semantics may need to resolve it as a forward reference, and to allow for the replacement of a SubprogramNameDetails place-holding symbol with the final SubprogramDetails symbol. Otherwise, as in the bug report below, confusing error messages may result. (The reference in question isn't really in the specification part of a subprogram, but due to the syntactic ambiguity between the array element assignment statement and a statement function definition, it appears to be so at the time that the reference is processed.) I needed to make DumpSymbols() available via SemanticsContext to analyze this bug, and left that new API in place to make things easier next time. Fixes https://github.com/llvm/llvm-project/issues/106705. 
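To make the mechanism concrete, here is a minimal stand-alone C++ sketch of why the caller has to look the name up again in the owning scope after resolution. The Scope/Symbol/resolve names below are invented for illustration and are not flang's real semantics classes; the point is only that resolving a sibling module procedure may replace the place-holding symbol object, so a pointer captured beforehand no longer describes the subprogram.

  // Illustrative sketch only; hypothetical Scope/Symbol types, not flang's.
  #include <iostream>
  #include <map>
  #include <memory>
  #include <string>

  struct Symbol {
    std::string name;
    bool isPlaceholder; // stands in for SubprogramNameDetails vs. SubprogramDetails
  };

  struct Scope {
    std::map<std::string, std::unique_ptr<Symbol>> symbols;
    // Resolving the subprogram may *replace* the place-holder with a new object.
    void resolve(const std::string &name) {
      symbols[name] = std::make_unique<Symbol>(Symbol{name, /*isPlaceholder=*/false});
    }
    const Symbol *find(const std::string &name) const {
      auto it = symbols.find(name);
      return it == symbols.end() ? nullptr : it->second.get();
    }
  };

  int main() {
    Scope scope;
    scope.symbols["baz"] = std::make_unique<Symbol>(Symbol{"baz", true});
    std::cout << "before resolve: placeholder = "
              << scope.find("baz")->isPlaceholder << "\n";
    scope.resolve("baz"); // the object stored under "baz" is replaced
    // Re-look the name up instead of reusing a pointer captured earlier;
    // this is the pattern the revised ResolveForward follows.
    std::cout << "after resolve:  placeholder = "
              << scope.find("baz")->isPlaceholder << "\n";
    return 0;
  }
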
--- flang/include/flang/Semantics/expression.h | 2 +- flang/include/flang/Semantics/semantics.h | 2 ++ flang/lib/Semantics/expression.cpp | 41 ++++++++++++++-------- flang/lib/Semantics/semantics.cpp | 6 ++-- flang/test/Semantics/smp-proc-ref.f90 | 20 +++++++++++ 5 files changed, 53 insertions(+), 18 deletions(-) create mode 100644 flang/test/Semantics/smp-proc-ref.f90 diff --git a/flang/include/flang/Semantics/expression.h b/flang/include/flang/Semantics/expression.h index a224b08da21da..b1304d704232d 100644 --- a/flang/include/flang/Semantics/expression.h +++ b/flang/include/flang/Semantics/expression.h @@ -354,7 +354,7 @@ class ExpressionAnalyzer { parser::CharBlock, const ProcedureDesignator &, ActualArguments &); using AdjustActuals = std::optional>; - bool ResolveForward(const Symbol &); + const Symbol *ResolveForward(const Symbol &); std::pair ResolveGeneric( const Symbol &, const ActualArguments &, const AdjustActuals &, bool isSubroutine, bool mightBeStructureConstructor = false); diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h index ec8d12b0f9865..e73f9d2e85d58 100644 --- a/flang/include/flang/Semantics/semantics.h +++ b/flang/include/flang/Semantics/semantics.h @@ -257,6 +257,8 @@ class SemanticsContext { void NoteDefinedSymbol(const Symbol &); bool IsSymbolDefined(const Symbol &) const; + void DumpSymbols(llvm::raw_ostream &); + private: struct ScopeIndexComparator { bool operator()(parser::CharBlock, parser::CharBlock) const; diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 4f8632a2055d9..60db02dc764b4 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -2650,9 +2650,9 @@ static int ComputeCudaMatchingDistance( // Handles a forward reference to a module function from what must // be a specification expression. Return false if the symbol is // an invalid forward reference. -bool ExpressionAnalyzer::ResolveForward(const Symbol &symbol) { +const Symbol *ExpressionAnalyzer::ResolveForward(const Symbol &symbol) { if (context_.HasError(symbol)) { - return false; + return nullptr; } if (const auto *details{ symbol.detailsIf()}) { @@ -2661,8 +2661,13 @@ bool ExpressionAnalyzer::ResolveForward(const Symbol &symbol) { // checking a specification expression in a sibling module // procedure. Resolve its names now so that its interface // is known. 
+ const semantics::Scope &scope{symbol.owner()}; semantics::ResolveSpecificationParts(context_, symbol); - if (symbol.has()) { + const Symbol *resolved{nullptr}; + if (auto iter{scope.find(symbol.name())}; iter != scope.cend()) { + resolved = &*iter->second; + } + if (!resolved || resolved->has()) { // When the symbol hasn't had its details updated, we must have // already been in the process of resolving the function's // specification part; but recursive function calls are not @@ -2670,8 +2675,8 @@ bool ExpressionAnalyzer::ResolveForward(const Symbol &symbol) { Say("The module function '%s' may not be referenced recursively in a specification expression"_err_en_US, symbol.name()); context_.SetError(symbol); - return false; } + return resolved; } else if (inStmtFunctionDefinition_) { semantics::ResolveSpecificationParts(context_, symbol); CHECK(symbol.has()); @@ -2679,10 +2684,10 @@ bool ExpressionAnalyzer::ResolveForward(const Symbol &symbol) { Say("The internal function '%s' may not be referenced in a specification expression"_err_en_US, symbol.name()); context_.SetError(symbol); - return false; + return nullptr; } } - return true; + return &symbol; } // Resolve a call to a generic procedure with given actual arguments. @@ -2709,20 +2714,21 @@ std::pair ExpressionAnalyzer::ResolveGeneric( } if (const auto *details{ultimate.detailsIf()}) { for (const Symbol &specific0 : details->specificProcs()) { - const Symbol &specific{BypassGeneric(specific0)}; - if (isSubroutine != !IsFunction(specific)) { + const Symbol &specific1{BypassGeneric(specific0)}; + if (isSubroutine != !IsFunction(specific1)) { continue; } - if (!ResolveForward(specific)) { + const Symbol *specific{ResolveForward(specific1)}; + if (!specific) { continue; } if (std::optional procedure{ characteristics::Procedure::Characterize( - ProcedureDesignator{specific}, context_.foldingContext(), + ProcedureDesignator{*specific}, context_.foldingContext(), /*emitError=*/false)}) { ActualArguments localActuals{actuals}; - if (specific.has()) { - if (!adjustActuals.value()(specific, localActuals)) { + if (specific->has()) { + if (!adjustActuals.value()(*specific, localActuals)) { continue; } } @@ -2751,9 +2757,9 @@ std::pair ExpressionAnalyzer::ResolveGeneric( } if (!procedure->IsElemental()) { // takes priority over elemental match - nonElemental = &specific; + nonElemental = specific; } else { - elemental = &specific; + elemental = specific; } crtMatchingDistance = ComputeCudaMatchingDistance( context_.languageFeatures(), *procedure, localActuals); @@ -2866,7 +2872,12 @@ auto ExpressionAnalyzer::GetCalleeAndArguments(const parser::Name &name, if (context_.HasError(symbol)) { return std::nullopt; // also handles null symbol } - const Symbol &ultimate{DEREF(symbol).GetUltimate()}; + symbol = ResolveForward(*symbol); + if (!symbol) { + return std::nullopt; + } + name.symbol = const_cast(symbol); + const Symbol &ultimate{symbol->GetUltimate()}; CheckForBadRecursion(name.source, ultimate); bool dueToAmbiguity{false}; bool isGenericInterface{ultimate.has()}; diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp index f7a277d1b414f..8592d1e5d6217 100644 --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -655,10 +655,12 @@ void Semantics::EmitMessages(llvm::raw_ostream &os) { context_.messages().Emit(os, context_.allCookedSources()); } -void Semantics::DumpSymbols(llvm::raw_ostream &os) { - DoDumpSymbols(os, context_.globalScope()); +void 
SemanticsContext::DumpSymbols(llvm::raw_ostream &os) { + DoDumpSymbols(os, globalScope()); } +void Semantics::DumpSymbols(llvm::raw_ostream &os) { context_.DumpSymbols(os); } + void Semantics::DumpSymbolsSources(llvm::raw_ostream &os) const { NameToSymbolMap symbols; GetSymbolNames(context_.globalScope(), symbols); diff --git a/flang/test/Semantics/smp-proc-ref.f90 b/flang/test/Semantics/smp-proc-ref.f90 new file mode 100644 index 0000000000000..9a2fae442e8e7 --- /dev/null +++ b/flang/test/Semantics/smp-proc-ref.f90 @@ -0,0 +1,20 @@ +!RUN: %flang_fc1 -fsyntax-only %s +module m + real :: qux(10) + interface + module subroutine bar(i) + end + module function baz() + end + end interface +end + +submodule(m) sm + contains + module procedure bar + qux(i) = baz() ! ensure no bogus error here + end + module procedure baz + baz = 1. + end +end From 1324789a65665c27eda9e04bc93db81cc859924c Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:55:21 -0700 Subject: [PATCH 136/425] [flang][preprocessor] Extend handling of line continuation replacements (#107010) Codes using traditional C preprocessors will sometimes put a keyword macro name in a free form continuation line in order to get macro replacement of part of an identifier, as in call subr_& &N& &(1.) where N is a keyword macro. f18 already handles this case, but not when there is white space between the macro name and the following continuation marker character '&'. Allow white space to appear. Fixes https://github.com/llvm/llvm-project/issues/106931. --- flang/lib/Parser/prescan.cpp | 12 ++++++++---- flang/test/Preprocessing/pp134.F90 | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index a0cd0ff263f92..7dcb61ac79109 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -731,12 +731,16 @@ bool Prescanner::NextToken(TokenSequence &tokens) { // Subtlety: When an identifier is split across three or more continuation // lines (or two continuation lines, immediately preceded or followed // by '&' free form continuation line markers, its parts are kept as - // distinct pp-tokens so that macro operates on them independently. - // This trick accommodates the historic practice of using line - // continuation for token pasting after replacement. + // distinct pp-tokens so that macro replacement operates on them + // independently. This trick accommodates the historic practice of + // using line continuation for token pasting after replacement. } else if (parts == 2) { + if (afterLast && afterLast < limit_) { + afterLast = SkipWhiteSpace(afterLast); + } if ((start > start_ && start[-1] == '&') || - (afterLast < limit_ && (*afterLast == '&' || *afterLast == '\n'))) { + (afterLast && afterLast < limit_ && + (*afterLast == '&' || *afterLast == '\n'))) { // call & call foo& call foo& // &MACRO& OR &MACRO& OR &MACRO // &foo(...) &(...) diff --git a/flang/test/Preprocessing/pp134.F90 b/flang/test/Preprocessing/pp134.F90 index bc34767224fa0..213baad900b60 100644 --- a/flang/test/Preprocessing/pp134.F90 +++ b/flang/test/Preprocessing/pp134.F90 @@ -1,7 +1,8 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s ! CHECK: print *, ADC, 1 -! CHECK: print *, AD, 1 -! CHECK: print *, DC, 1 +! CHECK: print *, AD, 2 +! CHECK: print *, DC, 3 +! CHECK: print *, AD(1), 4 ! CHECK: print *, AD ! 
CHECK: print *, AB #define B D @@ -12,10 +13,13 @@ &C, 1 print *, A& &B& - &, 1 + &, 2 print *, & &B& - &C, 1 + &C, 3 +print *, A& + &B & + &(1), 4 print *, A& &B print *, A& From 61759513c8166a6420ded480802de72859a45499 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 4 Sep 2024 11:08:05 -0700 Subject: [PATCH 137/425] [Analysis] Update getPromotionCandidatesForInstruction description (NFC) (#107277) Updates the description for getPromotionCandidatesForInstruction to reflect the cleanup done in #95624. --- llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h b/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h index b9cf048a71043..b6bdfb1275c92 100644 --- a/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h +++ b/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h @@ -49,9 +49,8 @@ class ICallPromotionAnalysis { /// Returns reference to array of InstrProfValueData for the given /// instruction \p I. /// - /// The \p NumVals, \p TotalCount and \p NumCandidates - /// are set to the number of values in the array, the total profile count - /// of the indirect call \p I, and the number of profitable candidates + /// The \p TotalCount and \p NumCandidates are set to the the total profile + /// count of the indirect call \p I and the number of profitable candidates /// in the given array (which is sorted in reverse order of profitability). /// /// The returned array space is owned by this class, and overwritten on From 6e60330af55bfdf5b34aed4c9197cd3afbf00498 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Wed, 4 Sep 2024 11:08:40 -0700 Subject: [PATCH 138/425] [SampleFDO] Read call-graph matching recovered top-level function profile (#101053) With extbinary profile format, initial profile loading only reads profile based on current function names in the module. However, if a function is renamed, sample loader skips to load its original profile(which has a different name), we will miss this case. To address this, we load the top-level profile candidate explicitly for the matching. If a match is found later, the function profile will be further preserved for use by the sample loader. 
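A minimal sketch of the lookup fallback this change introduces, using plain std:: containers rather than the real SampleProf data structures (the function and type names below are stand-ins, not the actual ProfileData API): try the function's current IR name first, and only then retry through the function-name-to-profile-name map that stale-profile matching fills in.

  // Illustrative sketch only; plain containers stand in for SampleProf classes.
  #include <iostream>
  #include <optional>
  #include <string>
  #include <unordered_map>

  using ProfileMap = std::unordered_map<std::string, int>;          // profile name -> samples
  using RenameMap  = std::unordered_map<std::string, std::string>;  // IR name -> profile name

  std::optional<int> getSamplesFor(const std::string &funcName,
                                   const ProfileMap &profiles,
                                   const RenameMap &funcNameToProfName) {
    // 1. Direct lookup under the function's current (IR) name.
    if (auto it = profiles.find(funcName); it != profiles.end())
      return it->second;
    // 2. Fallback: call-graph matching may have paired this function with a
    //    profile that still carries the old name; retry under that name.
    if (auto r = funcNameToProfName.find(funcName); r != funcNameToProfName.end())
      if (auto it = profiles.find(r->second); it != profiles.end())
        return it->second;
    return std::nullopt;
  }

  int main() {
    ProfileMap profiles{{"foo", 2724522}};    // profile kept the old name
    RenameMap renames{{"foo_rename", "foo"}}; // filled in by the matcher
    if (auto s = getSamplesFor("foo_rename", profiles, renames))
      std::cout << "samples for foo_rename: " << *s << "\n";
    return 0;
  }
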
--- .../llvm/ProfileData/SampleProfReader.h | 21 +++ .../Transforms/IPO/SampleProfileMatcher.h | 1 + .../Transforms/IPO/SampleProfileMatcher.cpp | 90 +++++++--- ...seudo-probe-stale-profile-toplev-func.prof | 23 +++ .../pseudo-probe-stale-profile-toplev-func.ll | 169 ++++++++++++++++++ 5 files changed, 278 insertions(+), 26 deletions(-) create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-toplev-func.prof create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-toplev-func.ll diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h index 0fd86600de21f..5301f23def3f3 100644 --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -424,6 +424,16 @@ class SampleProfileReader { if (It != Profiles.end()) return &It->second; + if (FuncNameToProfNameMap && !FuncNameToProfNameMap->empty()) { + auto R = FuncNameToProfNameMap->find(FunctionId(Fname)); + if (R != FuncNameToProfNameMap->end()) { + Fname = R->second.stringRef(); + auto It = Profiles.find(FunctionId(Fname)); + if (It != Profiles.end()) + return &It->second; + } + } + if (Remapper) { if (auto NameInProfile = Remapper->lookUpNameInProfile(Fname)) { auto It = Profiles.find(FunctionId(*NameInProfile)); @@ -505,6 +515,11 @@ class SampleProfileReader { void setModule(const Module *Mod) { M = Mod; } + void setFuncNameToProfNameMap( + const HashKeyMap &FPMap) { + FuncNameToProfNameMap = &FPMap; + } + protected: /// Map every function to its associated profile. /// @@ -541,6 +556,12 @@ class SampleProfileReader { std::unique_ptr Remapper; + // A map pointer to the FuncNameToProfNameMap in SampleProfileLoader, + // which maps the function name to the matched profile name. This is used + // for sample loader to look up profile using the new name. + const HashKeyMap + *FuncNameToProfNameMap = nullptr; + // A map from a function's context hash to its meta data section range, used // for on-demand read function profile metadata. std::unordered_map> diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h index a67f158433391..076d91adfd1de 100644 --- a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h +++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h @@ -198,6 +198,7 @@ class SampleProfileMatcher { // function and all inlinees. 
void countMismatchedCallsiteSamples(const FunctionSamples &FS); void computeAndReportProfileStaleness(); + void UpdateWithSalvagedProfiles(); LocToLocMap &getIRToProfileLocationMap(const Function &F) { auto Ret = FuncMappings.try_emplace( diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp index 312672e56b017..0c676e8fb95fd 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -36,6 +36,13 @@ static cl::opt MinCallCountForCGMatching( cl::desc("The minimum number of call anchors required for a function to " "run stale profile call graph matching.")); +static cl::opt LoadFuncProfileforCGMatching( + "load-func-profile-for-cg-matching", cl::Hidden, cl::init(false), + cl::desc( + "Load top-level profiles that the sample reader initially skipped for " + "the call-graph matching (only meaningful for extended binary " + "format)")); + extern cl::opt SalvageStaleProfile; extern cl::opt SalvageUnusedProfile; extern cl::opt PersistProfileStaleness; @@ -410,18 +417,19 @@ void SampleProfileMatcher::runOnFunction(Function &F) { // callsites in one context may differ from those in another context. To get // the maximum number of callsites, we merge the function profiles from all // contexts, aka, the flattened profile to find profile anchors. - const auto *FSFlattened = getFlattenedSamplesFor(F); - if (SalvageUnusedProfile && !FSFlattened) { + const auto *FSForMatching = getFlattenedSamplesFor(F); + if (SalvageUnusedProfile && !FSForMatching) { // Apply the matching in place to find the new function's matched profile. - // TODO: For extended profile format, if a function profile is unused and - // it's top-level, even if the profile is matched, it's not found in the - // profile. This is because sample reader only read the used profile at the - // beginning, we need to support loading the profile on-demand in future. auto R = FuncToProfileNameMap.find(&F); - if (R != FuncToProfileNameMap.end()) - FSFlattened = getFlattenedSamplesFor(R->second); + if (R != FuncToProfileNameMap.end()) { + FSForMatching = getFlattenedSamplesFor(R->second); + // Try to find the salvaged top-level profiles that are explicitly loaded + // for the matching, see "functionMatchesProfileHelper" for the details. + if (!FSForMatching && LoadFuncProfileforCGMatching) + FSForMatching = Reader.getSamplesFor(R->second.stringRef()); + } } - if (!FSFlattened) + if (!FSForMatching) return; // Anchors for IR. It's a map from IR location to callee name, callee name is @@ -432,7 +440,7 @@ void SampleProfileMatcher::runOnFunction(Function &F) { // Anchors for profile. It's a map from callsite location to a set of callee // name. AnchorMap ProfileAnchors; - findProfileAnchors(*FSFlattened, ProfileAnchors); + findProfileAnchors(*FSForMatching, ProfileAnchors); // Compute the callsite match states for profile staleness report. if (ReportProfileStaleness || PersistProfileStaleness) @@ -443,7 +451,7 @@ void SampleProfileMatcher::runOnFunction(Function &F) { // For probe-based profiles, run matching only when profile checksum is // mismatched. 
bool ChecksumMismatch = FunctionSamples::ProfileIsProbeBased && - !ProbeManager->profileIsValid(F, *FSFlattened); + !ProbeManager->profileIsValid(F, *FSForMatching); bool RunCFGMatching = !FunctionSamples::ProfileIsProbeBased || ChecksumMismatch; bool RunCGMatching = SalvageUnusedProfile; @@ -781,14 +789,30 @@ bool SampleProfileMatcher::functionMatchesProfileHelper( // two sequences are. float Similarity = 0.0; - const auto *FSFlattened = getFlattenedSamplesFor(ProfFunc); - if (!FSFlattened) + const auto *FSForMatching = getFlattenedSamplesFor(ProfFunc); + // With extbinary profile format, initial profile loading only reads profile + // based on current function names in the module. + // However, if a function is renamed, sample loader skips to load its original + // profile(which has a different name), we will miss this case. To address + // this, we load the top-level profile candidate explicitly for the matching. + if (!FSForMatching && LoadFuncProfileforCGMatching) { + DenseSet TopLevelFunc({ProfFunc.stringRef()}); + if (std::error_code EC = Reader.read(TopLevelFunc)) + return false; + FSForMatching = Reader.getSamplesFor(ProfFunc.stringRef()); + LLVM_DEBUG({ + if (FSForMatching) + dbgs() << "Read top-level function " << ProfFunc + << " for call-graph matching\n"; + }); + } + if (!FSForMatching) return false; // The check for similarity or checksum may not be reliable if the function is // tiny, we use the number of basic block as a proxy for the function // complexity and skip the matching if it's too small. if (IRFunc.size() < MinFuncCountForCGMatching || - FSFlattened->getBodySamples().size() < MinFuncCountForCGMatching) + FSForMatching->getBodySamples().size() < MinFuncCountForCGMatching) return false; // For probe-based function, we first trust the checksum info. If the checksum @@ -796,7 +820,7 @@ bool SampleProfileMatcher::functionMatchesProfileHelper( if (FunctionSamples::ProfileIsProbeBased) { const auto *FuncDesc = ProbeManager->getDesc(IRFunc); if (FuncDesc && - !ProbeManager->profileIsHashMismatched(*FuncDesc, *FSFlattened)) { + !ProbeManager->profileIsHashMismatched(*FuncDesc, *FSForMatching)) { LLVM_DEBUG(dbgs() << "The checksums for " << IRFunc.getName() << "(IR) and " << ProfFunc << "(Profile) match.\n"); @@ -807,7 +831,7 @@ bool SampleProfileMatcher::functionMatchesProfileHelper( AnchorMap IRAnchors; findIRAnchors(IRFunc, IRAnchors); AnchorMap ProfileAnchors; - findProfileAnchors(*FSFlattened, ProfileAnchors); + findProfileAnchors(*FSForMatching, ProfileAnchors); AnchorList FilteredIRAnchorsList; AnchorList FilteredProfileAnchorList; @@ -863,6 +887,29 @@ bool SampleProfileMatcher::functionMatchesProfile(Function &IRFunc, return Matched; } +void SampleProfileMatcher::UpdateWithSalvagedProfiles() { + DenseSet ProfileSalvagedFuncs; + // Update FuncNameToProfNameMap and SymbolMap. + for (auto &I : FuncToProfileNameMap) { + assert(I.first && "New function is null"); + FunctionId FuncName(I.first->getName()); + ProfileSalvagedFuncs.insert(I.second.stringRef()); + FuncNameToProfNameMap->emplace(FuncName, I.second); + + // We need to remove the old entry to avoid duplicating the function + // processing. 
+ SymbolMap->erase(FuncName); + SymbolMap->emplace(I.second, I.first); + } + + // With extbinary profile format, initial profile loading only reads profile + // based on current function names in the module, so we need to load top-level + // profiles for functions with different profile name explicitly after + // function-profile name map is established with stale profile matching. + Reader.read(ProfileSalvagedFuncs); + Reader.setFuncNameToProfNameMap(*FuncNameToProfNameMap); +} + void SampleProfileMatcher::runOnModule() { ProfileConverter::flattenProfile(Reader.getProfiles(), FlattenedProfiles, FunctionSamples::ProfileIsCS); @@ -880,17 +927,8 @@ void SampleProfileMatcher::runOnModule() { runOnFunction(*F); } - // Update the data in SampleLoader. if (SalvageUnusedProfile) - for (auto &I : FuncToProfileNameMap) { - assert(I.first && "New function is null"); - FunctionId FuncName(I.first->getName()); - FuncNameToProfNameMap->emplace(FuncName, I.second); - // We need to remove the old entry to avoid duplicating the function - // processing. - SymbolMap->erase(FuncName); - SymbolMap->emplace(I.second, I.first); - } + UpdateWithSalvagedProfiles(); if (SalvageStaleProfile) distributeIRToProfileLocationMap(); diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-toplev-func.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-toplev-func.prof new file mode 100644 index 0000000000000..86c8cb3285afe --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-toplev-func.prof @@ -0,0 +1,23 @@ +foo:2724522:51 + 1: 51 + 2: 452674 + 3: 47 + 4: 497875 + 6: 415959 + 10: 452623 + 11: 452687 bar:452687 + 12: 452623 + 13: 47 + !CFGChecksum: 281479271677951 +bar:452687:452687 + 1: 452687 + !CFGChecksum: 4294967295 +main:204:0 + 1: 0 + 2: 51 + 3: 0 + 4: 51 + 5: 51 foo:51 + 6: 51 + 7: 0 + !CFGChecksum: 281582264815352 diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-toplev-func.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-toplev-func.ll new file mode 100644 index 0000000000000..c839364f23553 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-toplev-func.ll @@ -0,0 +1,169 @@ +; REQUIRES: x86_64-linux +; REQUIRES: asserts +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-toplev-func.prof --salvage-stale-profile --salvage-unused-profile -report-profile-staleness -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline --min-call-count-for-cg-matching=0 --min-func-count-for-cg-matching=0 --load-func-profile-for-cg-matching 2>&1 | FileCheck %s -check-prefix=CHECK-TEXT +; RUN: llvm-profdata merge --sample %S/Inputs/pseudo-probe-stale-profile-toplev-func.prof -extbinary -o %t.extbinary +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t.extbinary --salvage-stale-profile --salvage-unused-profile -report-profile-staleness -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline --min-call-count-for-cg-matching=0 --min-func-count-for-cg-matching=0 --load-func-profile-for-cg-matching 2>&1 | FileCheck %s -check-prefix=CHECK-EXTBIN + +; CHECK-TEXT: Run stale profile matching for main +; CHECK-TEXT-NOT: Read top-level function foo for call-graph matching +; CHECK-TEXT: The checksums for foo_rename(IR) and foo(Profile) match. 
+; CHECK-TEXT: Function:foo_rename matches profile:foo +; CHECK-TEXT: Run stale profile matching for foo_rename +; CHECK-TEXT: (1/3) of functions' profile are matched and (2724522/3177413) of samples are reused by call graph matching. + +; CHECK-TEXT: Processing Function main +; CHECK-TEXT: 5: call void @foo_rename(), !dbg ![[#]] - weight: 51 +; CHECK-TEXT: Processing Function foo_rename +; CHECK-TEXT: 2: %call = call i32 @bar(i32 noundef %0), !dbg ![[#]] - weight: 452674 + + +; CHECK-EXTBIN: Run stale profile matching for main +; CHECK-EXTBIN: Read top-level function foo for call-graph matching +; CHECK-EXTBIN: The checksums for foo_rename(IR) and foo(Profile) match. +; CHECK-EXTBIN: Function:foo_rename matches profile:foo +; CHECK-EXTBIN: Run stale profile matching for foo_rename +; CHECK-EXTBIN: (1/3) of functions' profile are matched and (2724522/3177413) of samples are reused by call graph matching. + +; CHECK-EXTBIN: Processing Function main +; CHECK-EXTBIN: 5: call void @foo_rename(), !dbg ![[#]] - weight: 51 +; CHECK-EXTBIN: Processing Function foo_rename +; CHECK-EXTBIN: 2: %call = call i32 @bar(i32 noundef %0), !dbg ![[#]] - weight: 452674 + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@x = dso_local global i32 0, align 4, !dbg !0 + +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @bar(i32 noundef %x) #0 !dbg !18 { +entry: + #dbg_value(i32 %x, !22, !DIExpression(), !23) + call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 1, i32 0, i64 -1), !dbg !24 + %add = add nsw i32 %x, 1, !dbg !25 + ret i32 %add, !dbg !26 +} + +; Function Attrs: noinline nounwind uwtable +define dso_local void @foo_rename() #0 !dbg !27 { +entry: + call void @llvm.pseudoprobe(i64 -2115950948644264162, i64 1, i32 0, i64 -1), !dbg !30 + %0 = load volatile i32, ptr @x, align 4, !dbg !30, !tbaa !31 + %call = call i32 @bar(i32 noundef %0), !dbg !35 + %1 = load volatile i32, ptr @x, align 4, !dbg !37, !tbaa !31 + %add = add nsw i32 %1, %call, !dbg !37 + store volatile i32 %add, ptr @x, align 4, !dbg !37, !tbaa !31 + ret void, !dbg !38 +} + +; Function Attrs: nounwind uwtable +define dso_local i32 @main() #1 !dbg !39 { +entry: + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 1, i32 0, i64 -1), !dbg !45 + #dbg_value(i32 0, !43, !DIExpression(), !46) + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ], !dbg !48 + #dbg_value(i32 %i.0, !43, !DIExpression(), !46) + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 2, i32 0, i64 -1), !dbg !49 + %cmp = icmp slt i32 %i.0, 100000, !dbg !51 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !52 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 3, i32 0, i64 -1), !dbg !53 + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 7, i32 0, i64 -1), !dbg !54 + ret i32 0, !dbg !54 + +for.body: ; preds = %for.cond + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 4, i32 0, i64 -1), !dbg !55 + call void @foo_rename(), !dbg !57 + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 6, i32 0, i64 -1), !dbg !59 + %inc = add nsw i32 %i.0, 1, !dbg !59 + #dbg_value(i32 %inc, !43, !DIExpression(), !46) + br label %for.cond, !dbg !60, !llvm.loop !61 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void 
@llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) +declare void @llvm.pseudoprobe(i64, i64, i32, i64) #3 + +attributes #0 = { noinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" } +attributes #1 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13} +!llvm.ident = !{!14} +!llvm.pseudo_probe_desc = !{!15, !16, !17} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 20.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "test_rename.c", directory: "/home", checksumkind: CSK_MD5, checksum: "11a33a83e4d190ebda0792d0610f0c67") +!4 = !{!0} +!5 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !6) +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 5} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{i32 8, !"PIC Level", i32 2} +!11 = !{i32 7, !"PIE Level", i32 2} +!12 = !{i32 7, !"uwtable", i32 2} +!13 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!14 = !{!"clang version 20.0.0"} +!15 = !{i64 -2012135647395072713, i64 4294967295, !"bar"} +!16 = !{i64 -2115950948644264162, i64 281479271677951, !"foo_rename"} +!17 = !{i64 -2624081020897602054, i64 281582264815352, !"main"} +!18 = distinct !DISubprogram(name: "bar", scope: !3, file: !3, line: 3, type: !19, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21) +!19 = !DISubroutineType(types: !20) +!20 = !{!6, !6} +!21 = !{!22} +!22 = !DILocalVariable(name: "x", arg: 1, scope: !18, file: !3, line: 3, type: !6) +!23 = !DILocation(line: 0, scope: !18) +!24 = !DILocation(line: 4, column: 10, scope: !18) +!25 = !DILocation(line: 4, column: 12, scope: !18) +!26 = !DILocation(line: 4, column: 3, scope: !18) +!27 = distinct !DISubprogram(name: "foo_rename", scope: !3, file: !3, line: 7, type: !28, scopeLine: 7, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!28 = !DISubroutineType(types: !29) +!29 = !{null} +!30 = !DILocation(line: 8, column: 15, scope: !27) +!31 = !{!32, !32, i64 0} +!32 = !{!"int", !33, i64 0} +!33 = !{!"omnipotent char", !34, i64 0} +!34 = !{!"Simple C/C++ TBAA"} +!35 = !DILocation(line: 8, column: 11, scope: !36) +!36 = 
!DILexicalBlockFile(scope: !27, file: !3, discriminator: 455082007) +!37 = !DILocation(line: 8, column: 8, scope: !27) +!38 = !DILocation(line: 9, column: 1, scope: !27) +!39 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !40, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !42) +!40 = !DISubroutineType(types: !41) +!41 = !{!6} +!42 = !{!43} +!43 = !DILocalVariable(name: "i", scope: !44, file: !3, line: 12, type: !6) +!44 = distinct !DILexicalBlock(scope: !39, file: !3, line: 12, column: 3) +!45 = !DILocation(line: 12, column: 12, scope: !44) +!46 = !DILocation(line: 0, scope: !44) +!47 = !DILocation(line: 12, column: 8, scope: !44) +!48 = !DILocation(line: 12, scope: !44) +!49 = !DILocation(line: 12, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !44, file: !3, line: 12, column: 3) +!51 = !DILocation(line: 12, column: 21, scope: !50) +!52 = !DILocation(line: 12, column: 3, scope: !44) +!53 = !DILocation(line: 0, scope: !39) +!54 = !DILocation(line: 15, column: 1, scope: !39) +!55 = !DILocation(line: 13, column: 7, scope: !56) +!56 = distinct !DILexicalBlock(scope: !50, file: !3, line: 12, column: 40) +!57 = !DILocation(line: 13, column: 7, scope: !58) +!58 = !DILexicalBlockFile(scope: !56, file: !3, discriminator: 455082031) +!59 = !DILocation(line: 12, column: 36, scope: !50) +!60 = !DILocation(line: 12, column: 3, scope: !50) +!61 = distinct !{!61, !52, !62, !63} +!62 = !DILocation(line: 14, column: 3, scope: !44) +!63 = !{!"llvm.loop.mustprogress"} From 2e0ded3371f8d42f376bdfd4d70687537e36818e Mon Sep 17 00:00:00 2001 From: R-Goc <131907007+R-Goc@users.noreply.github.com> Date: Wed, 4 Sep 2024 20:10:36 +0200 Subject: [PATCH 139/425] [Windows SEH] Fix crash on empty seh block (#107031) Fixes https://github.com/llvm/llvm-project/issues/105813 and https://github.com/llvm/llvm-project/issues/106915. Adds a check for the end of the iterator, which can be a sentinel. The issue was introduced in https://github.com/llvm/llvm-project/commit/0efe111365ae176671e01252d24028047d807a84 from what I can see, so along with the introduction of /EHa support. 
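The crash pattern generalizes: any "first element satisfying P" helper can return the end sentinel when the block is empty (or holds only PHIs), and dereferencing that sentinel is undefined behavior. Below is a small stand-alone C++ sketch of the guarded pattern added here; std::find_if and the Instr type stand in for getFirstNonPHI and MachineInstr and are not the actual SelectionDAGISel code.

  // Illustrative sketch only: check for the end sentinel before dereferencing,
  // mirroring the `MBBb == MBB.end()` guard this patch adds.
  #include <algorithm>
  #include <iostream>
  #include <string>
  #include <vector>

  struct Instr { std::string op; };

  const Instr *firstNonPhi(const std::vector<Instr> &block) {
    auto it = std::find_if(block.begin(), block.end(),
                           [](const Instr &i) { return i.op != "phi"; });
    if (it == block.end()) // empty block, or PHIs only: nothing to return
      return nullptr;      // without this check, *it would be UB
    return &*it;
  }

  int main() {
    std::vector<Instr> empty;                     // like the empty SEH scope block
    std::vector<Instr> normal{{"phi"}, {"call"}};
    std::cout << (firstNonPhi(empty) ? "found" : "none") << "\n"; // prints "none"
    std::cout << firstNonPhi(normal)->op << "\n";                 // prints "call"
    return 0;
  }
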
--- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 4 ++++ .../CodeGen/WinEH/wineh-empty-seh-scope.ll | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 llvm/test/CodeGen/WinEH/wineh-empty-seh-scope.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index b37e54d66ddf5..263a213bd4f64 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1476,6 +1476,10 @@ void SelectionDAGISel::reportIPToStateForBlocks(MachineFunction *MF) { if (BB->getFirstMayFaultInst()) { // Report IP range only for blocks with Faulty inst auto MBBb = MBB.getFirstNonPHI(); + + if (MBBb == MBB.end()) + continue; + MachineInstr *MIb = &*MBBb; if (MIb->isTerminator()) continue; diff --git a/llvm/test/CodeGen/WinEH/wineh-empty-seh-scope.ll b/llvm/test/CodeGen/WinEH/wineh-empty-seh-scope.ll new file mode 100644 index 0000000000000..5f382f10f180b --- /dev/null +++ b/llvm/test/CodeGen/WinEH/wineh-empty-seh-scope.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=x86_64-pc-windows-msvc19.41.34120 < %s | FileCheck %s + +define void @foo() personality ptr @__CxxFrameHandler3 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: nop # avoids zero-length function + call void @llvm.seh.scope.begin() + unreachable +} + +declare i32 @__CxxFrameHandler3(...) + +declare void @llvm.seh.scope.begin() + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"eh-asynch", i32 1} From 36c210bb340cfdc68d314dd188e18c0bf017b999 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 11:14:31 -0700 Subject: [PATCH 140/425] [RISCV] Remove pre-assignment of mask vectors during call lowering. NFC (#107192) The first mask vector operand is supposed to be assigned to V0. No other vector types will be assigned to V0. We don't need to pre-assign, we can just try V0 first for any mask vectors in the normal processing. --- .../Target/RISCV/GISel/RISCVCallLowering.cpp | 26 ++-------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 50 ++++--------------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 9 ++-- 3 files changed, 17 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 6e33032384ede..31a9df53a2aa1 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -35,9 +35,6 @@ struct RISCVOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { // Whether this is assigning args for a return. bool IsRet; - // true if assignArg has been called for a mask argument, false otherwise. 
- bool AssignedFirstMaskArg = false; - public: RISCVOutgoingValueAssigner( RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) @@ -52,16 +49,9 @@ struct RISCVOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { const DataLayout &DL = MF.getDataLayout(); const RISCVSubtarget &Subtarget = MF.getSubtarget(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions() && !AssignedFirstMaskArg && - ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) { - FirstMaskArgument = ValNo; - AssignedFirstMaskArg = true; - } - if (RISCVAssignFn(DL, Subtarget.getTargetABI(), ValNo, ValVT, LocVT, LocInfo, Flags, State, Info.IsFixed, IsRet, Info.Ty, - *Subtarget.getTargetLowering(), FirstMaskArgument)) + *Subtarget.getTargetLowering())) return true; StackSize = State.getStackSize(); @@ -197,9 +187,6 @@ struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { // Whether this is assigning args from a return. bool IsRet; - // true if assignArg has been called for a mask argument, false otherwise. - bool AssignedFirstMaskArg = false; - public: RISCVIncomingValueAssigner( RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) @@ -217,16 +204,9 @@ struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { if (LocVT.isScalableVector()) MF.getInfo()->setIsVectorCall(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions() && !AssignedFirstMaskArg && - ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) { - FirstMaskArgument = ValNo; - AssignedFirstMaskArg = true; - } - if (RISCVAssignFn(DL, Subtarget.getTargetABI(), ValNo, ValVT, LocVT, LocInfo, Flags, State, /*IsFixed=*/true, IsRet, Info.Ty, - *Subtarget.getTargetLowering(), FirstMaskArgument)) + *Subtarget.getTargetLowering())) return true; StackSize = State.getStackSize(); @@ -483,7 +463,7 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF, MVT VT = MVT::getVT(Outs[I].Ty); if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo, /*IsFixed=*/true, - /*isRet=*/true, nullptr, TLI, FirstMaskArgument)) + /*isRet=*/true, nullptr, TLI)) return false; } return true; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8f95a86ade303..bc661c72e5ecc 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19122,17 +19122,16 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, return false; } -static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, - std::optional FirstMaskArgument, - CCState &State, +static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State, const RISCVTargetLowering &TLI) { const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT); if (RC == &RISCV::VRRegClass) { // Assign the first mask argument to V0. // This is an interim calling convention and it may be changed in the // future. 
- if (FirstMaskArgument && ValNo == *FirstMaskArgument) - return State.AllocateReg(RISCV::V0); + if (ValVT.getVectorElementType() == MVT::i1) + if (MCRegister Reg = State.AllocateReg(RISCV::V0)) + return Reg; return State.AllocateReg(ArgVRs); } if (RC == &RISCV::VRM2RegClass) @@ -19170,8 +19169,7 @@ static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument) { + bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI) { unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); assert(XLen == 32 || XLen == 64); MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; @@ -19351,7 +19349,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, else if (ValVT == MVT::f64 && !UseGPRForF64) Reg = State.AllocateReg(ArgFPR64s); else if (ValVT.isVector() || ValVT.isRISCVVectorTuple()) { - Reg = allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI); + Reg = allocateRVVReg(ValVT, ValNo, State, TLI); if (!Reg) { // For return values, the vector must be passed fully via registers or // via the stack. @@ -19421,16 +19419,6 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, return false; } -template -static std::optional preAssignMask(const ArgTy &Args) { - for (const auto &ArgIdx : enumerate(Args)) { - MVT ArgVT = ArgIdx.value().VT; - if (ArgVT.isVector() && ArgVT.getVectorElementType() == MVT::i1) - return ArgIdx.index(); - } - return std::nullopt; -} - void RISCVTargetLowering::analyzeInputArgs( MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet, @@ -19438,10 +19426,6 @@ void RISCVTargetLowering::analyzeInputArgs( unsigned NumArgs = Ins.size(); FunctionType *FType = MF.getFunction().getFunctionType(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Ins); - for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Ins[i].VT; ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; @@ -19454,8 +19438,7 @@ void RISCVTargetLowering::analyzeInputArgs( RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, - ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this, - FirstMaskArgument)) { + ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this)) { LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(nullptr); @@ -19469,10 +19452,6 @@ void RISCVTargetLowering::analyzeOutputArgs( CallLoweringInfo *CLI, RISCVCCAssignFn Fn) const { unsigned NumArgs = Outs.size(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Outs); - for (unsigned i = 0; i != NumArgs; i++) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; @@ -19480,8 +19459,7 @@ void RISCVTargetLowering::analyzeOutputArgs( RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, - ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this, - FirstMaskArgument)) { + ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this)) { LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(nullptr); @@ -19659,8 +19637,7 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI 
ABI, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, - const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument) { + const RISCVTargetLowering &TLI) { if (LocVT == MVT::i32 || LocVT == MVT::i64) { if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); @@ -19744,8 +19721,7 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, } if (LocVT.isVector()) { - if (MCRegister Reg = - allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI)) { + if (MCRegister Reg = allocateRVVReg(ValVT, ValNo, State, TLI)) { // Fixed-length vectors are located in the corresponding scalable-vector // container types. if (ValVT.isFixedLengthVector()) @@ -20377,17 +20353,13 @@ bool RISCVTargetLowering::CanLowerReturn( SmallVector RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Outs); - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, - nullptr, *this, FirstMaskArgument)) + nullptr, *this)) return false; } return true; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index f1d1cca043b35..3beee4686956e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -904,8 +904,7 @@ class RISCVTargetLowering : public TargetLowering { CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, - const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + const RISCVTargetLowering &TLI); private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, @@ -1054,14 +1053,12 @@ namespace RISCV { bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI); bool CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI); bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, From b30880e975279c1c8ef4c2645eb03063e4b19f2b Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Wed, 4 Sep 2024 11:25:30 -0700 Subject: [PATCH 141/425] [SandboxIR] Fix linking error caused by 840da2e8ba7e0f77938adfc6f6d315137542a1b8 --- llvm/lib/SandboxIR/Tracker.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index c6eb9fc68a4b1..b1f472d7928f4 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -251,10 +251,12 @@ void ShuffleVectorSetMask::dump() const { CmpSwapOperands::CmpSwapOperands(CmpInst *Cmp) : Cmp(Cmp) {} void CmpSwapOperands::revert(Tracker &Tracker) 
{ Cmp->swapOperands(); } +#ifndef NDEBUG void CmpSwapOperands::dump() const { dump(dbgs()); dbgs() << "\n"; } +#endif void Tracker::save() { State = TrackerState::Record; } From a43137c3f85fd87f90c9a8ffaebd71d432018e79 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Wed, 4 Sep 2024 11:46:01 -0700 Subject: [PATCH 142/425] [LLVM][DWARF] Make some effort to avoid duplicates in .debug_ranges. (#106614) Inlining and zero-cost abstractions tend to produce volumes of debug info with identical ranges. When built with full debugging information (the equivalent of -g2) librustc_driver.so has 2.1 million entries in .debug_ranges. But only 1.1 million of those entries are unique. While in principle all duplicates could be eliminated with a hashtable, checking to see if the new range is exactly identical to the previous range and skipping a new addition if it is is sufficient to eliminate 99.99% of the duplicates. This reduces the size of librustc_driver.so's .debug_ranges section by 35%, or the overall binary size a little more than 1%. --- llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp | 16 ++++- llvm/lib/CodeGen/AsmPrinter/DwarfFile.h | 4 ++ .../Generic/debug-ranges-duplication.ll | 70 +++++++++++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp index eab798c0da784..cd1279d202132 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp @@ -121,7 +121,19 @@ void DwarfFile::addScopeLabel(LexicalScope *LS, DbgLabel *Label) { std::pair DwarfFile::addRange(const DwarfCompileUnit &CU, SmallVector R) { - CURangeLists.push_back( - RangeSpanList{Asm->createTempSymbol("debug_ranges"), &CU, std::move(R)}); + bool CanReuseLastRange = false; + + if (!CURangeLists.empty()) { + auto Last = CURangeLists.back(); + if (Last.CU == &CU && Last.Ranges == R) { + CanReuseLastRange = true; + } + } + + if (!CanReuseLastRange) { + CURangeLists.push_back(RangeSpanList{Asm->createTempSymbol("debug_ranges"), + &CU, std::move(R)}); + } + return std::make_pair(CURangeLists.size() - 1, &CURangeLists.back()); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h index f76858fc2f36a..0fc2b91ddfa91 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -37,6 +37,10 @@ class MDNode; struct RangeSpan { const MCSymbol *Begin; const MCSymbol *End; + + bool operator==(const RangeSpan &Other) const { + return Begin == Other.Begin && End == Other.End; + } }; struct RangeSpanList { diff --git a/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll b/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll new file mode 100644 index 0000000000000..540400be740a5 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll @@ -0,0 +1,70 @@ +; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck %s +; +; Generated from the following C++ source with: +; clang -S -emit-llvm -g -O2 test.c +; +; /* BEGIN SOURCE */ +; void f1(); +; inline void f2() { +; f1(); +; f1(); +; } +; inline void f3() { +; f2(); +; } +; void f4() { +; f3(); +; f1(); +; } +; /* END SOURCE */ +; +; Minor complication: after generating the LLVM IR, it was manually edited so +; that the 'f1()' call from f3 was reordered to appear between the two inlined +; f1 calls from f2. 
This causes f2's inlined_subroutine to use DW_AT_ranges. + +; Check that identical debug ranges in succession reuse the same entry in +; .debug_ranges rather than emitting duplicate entries. + +; CHECK: DW_TAG_inlined_subroutine +; CHECK: DW_AT_ranges +; CHECK-SAME: rangelist = 0x[[#%.8X,RANGE:]] +; CHECK: DW_TAG_inlined_subroutine +; CHECK: DW_AT_ranges +; CHECK-SAME: rangelist = 0x[[#RANGE]] + +; Function Attrs: nounwind uwtable +define dso_local void @f4() local_unnamed_addr !dbg !9 { +entry: + tail call void (...) @f1(), !dbg !12 + tail call void (...) @f1(), !dbg !18 + tail call void (...) @f1(), !dbg !17 + ret void, !dbg !19 +} + +declare !dbg !20 void @f1(...) local_unnamed_addr + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (https://github.com/llvm/llvm-project.git 9edd998e10fabfff067b9e6e5b044f85a24d0dd5)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/home/khuey/dev/llvm-project", checksumkind: CSK_MD5, checksum: "4510feb241cf078af753e3dc13205127") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{!"clang version 20.0.0git (https://github.com/llvm/llvm-project.git 9edd998e10fabfff067b9e6e5b044f85a24d0dd5)"} +!9 = distinct !DISubprogram(name: "f4", scope: !1, file: !1, line: 9, type: !10, scopeLine: 9, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DISubroutineType(types: !11) +!11 = !{null} +!12 = !DILocation(line: 3, column: 3, scope: !13, inlinedAt: !14) +!13 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 2, type: !10, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!14 = distinct !DILocation(line: 7, column: 3, scope: !15, inlinedAt: !16) +!15 = distinct !DISubprogram(name: "f3", scope: !1, file: !1, line: 6, type: !10, scopeLine: 6, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!16 = distinct !DILocation(line: 10, column: 3, scope: !9) +!17 = !DILocation(line: 4, column: 3, scope: !13, inlinedAt: !14) +!18 = !DILocation(line: 11, column: 3, scope: !9) +!19 = !DILocation(line: 12, column: 1, scope: !9) +!20 = !DISubprogram(name: "f1", scope: !1, file: !1, line: 1, type: !10, spFlags: DISPFlagOptimized) From c1667f909949d15c593e4a03a4e992cffa72ad3c Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 4 Sep 2024 15:06:27 -0400 Subject: [PATCH 143/425] Fix `transpose->unpack` folding pattern for the partial-tile case of `unpack` (#107271) Just directly create the empty tensor of appropriate shape instead of relying on `UnPackOp::createDestinationTensor` which is trying to infer the destination shape, which isn't possible in general with the set of paramters that it is taking. 
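
As a standalone illustration of why the inference cannot work (plain C++, not MLIR code; the numbers mirror the partial-tile test added below): packing rounds each dimension up to a multiple of its tile size, so distinct source shapes map to the same packed shape, and the unpacked shape has to be taken from the op's own result (via reifyResultShapes) rather than re-derived from the packed operand.

  #include <array>
  #include <cstdio>

  // Packed shape for a rows x cols source with inner tiles tr x tc.
  static std::array<int, 4> packedShape(int rows, int cols, int tr, int tc) {
    return {(rows + tr - 1) / tr, (cols + tc - 1) / tc, tr, tc};
  }

  int main() {
    auto partial = packedShape(15, 3, 16, 4);  // partial tile: 15x3 source
    auto full    = packedShape(16, 4, 16, 4);  // full tile:    16x4 source
    // Both print 1x1x16x4, so the packed shape alone cannot recover 15x3.
    std::printf("%dx%dx%dx%d\n", partial[0], partial[1], partial[2], partial[3]);
    std::printf("%dx%dx%dx%d\n", full[0], full[1], full[2], full[3]);
    return 0;
  }
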
Signed-off-by: Benoit Jacob --- .../Transforms/PackAndUnpackPatterns.cpp | 13 ++++--- .../Tensor/fold-into-pack-and-unpack.mlir | 35 +++++++++++++++---- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp index c681cadcb27cb..995486c87771a 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp @@ -439,6 +439,11 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp if (failed(maybePerm)) return failure(); + SmallVector> unpackOpResultDims; + if (failed(reifyResultShapes(rewriter, unPackOp, unpackOpResultDims))) { + return failure(); + } + SmallVector inverseTransposePerm = invertPermutationVector(maybePerm.value()); auto outerDimsPerm = unPackOp.getOuterDimsPerm(); @@ -448,7 +453,6 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp SmallVector newOuterDimsPermVec; SmallVector newInnerDimsPosVec; SmallVector newMixedInnerTilesVec; - if (!checkAndPermute(inverseTransposePerm, outerDimsPerm, newOuterDimsPermVec, destRank)) return rewriter.notifyMatchFailure( @@ -463,9 +467,10 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp newInnerDimsPosVec.push_back(innerDimsPos[remappedPosition]); } - Value output = unPackOp.createDestinationTensor( - rewriter, unPackOp.getLoc(), linalgOp->getOperand(0), - newMixedInnerTilesVec, newInnerDimsPosVec, newOuterDimsPermVec); + auto elemType = + cast(unPackOp->getResultTypes()[0]).getElementType(); + Value output = rewriter.create( + unPackOp->getLoc(), unpackOpResultDims[0], elemType); rewriter.replaceOpWithNewOp( unPackOp, linalgOp->getOperand(0), output, newInnerDimsPosVec, diff --git a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir index 629a4c2135720..bff913f5f55fe 100644 --- a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir +++ b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir @@ -550,6 +550,32 @@ func.func @linalg_transpose_tensor_unpack_fold(%arg0: tensor<1x1x4x16xi32>) -> t // ----- +func.func @linalg_transpose_tensor_unpack_fold_partial_tile(%arg0: tensor<1x1x4x16xi32>) -> tensor<15x3xi32> { + %0 = tensor.empty() : tensor<1x1x16x4xi32> + %transposed = linalg.transpose ins(%arg0 : tensor<1x1x4x16xi32>) + outs(%0 : tensor<1x1x16x4xi32>) + permutation = [1, 0, 3, 2] + %1 = tensor.empty() : tensor<15x3xi32> + %unpack = tensor.unpack %transposed + outer_dims_perm = [0, 1] + inner_dims_pos = [0, 1] + inner_tiles = [16, 4] into + %1 : tensor<1x1x16x4xi32> -> tensor<15x3xi32> + return %unpack : tensor<15x3xi32> +} +//CHECK-LABEL: func.func @linalg_transpose_tensor_unpack_fold_partial_tile( +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x4x16xi32>) -> tensor<15x3xi32> { +// CHECK: %[[OUT:.+]] = tensor.empty() : tensor<15x3xi32> +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK-SAME: outer_dims_perm = [1, 0] +// CHECK-SAME: inner_dims_pos = [1, 0] +// CHECK-SAME: inner_tiles = [4, 16] +// CHECK-SAME: into %[[OUT]] : tensor<1x1x4x16xi32> -> tensor<15x3xi32> +// CHECK: return %[[UNPACK]] : tensor<15x3xi32> +// CHECK: } + +// ----- + func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %transpose_dest: tensor, %unpack_dest: tensor, %tile_p : index, %tile_q : index) -> tensor { %transposed = linalg.transpose ins(%arg0 : tensor) @@ -563,17 +589,14 @@ func.func 
@linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile into %unpack_dest : tensor -> tensor return %unpack : tensor } -// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 * s1)> // CHECK-LABEL: func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes( // CHECK-SAME: %[[ARG0:.+]]: tensor, %[[ARG1:.+]]: tensor, %[[ARG2:.+]]: tensor, // CHECK-SAME: %[[IDX1:.+]]: index, %[[IDX2:.+]]: index) -> tensor { // CHECK-DAG: %[[CST1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[CST0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[CST0]] : tensor -// CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[CST1]] : tensor -// CHECK-DAG: %[[AMAP0:.+]] = affine.apply #[[$MAP]]()[%[[DIM1]], %[[IDX2]]] -// CHECK-DAG: %[[AMAP1:.+]] = affine.apply #[[$MAP]]()[%[[DIM0]], %[[IDX1]]] -// CHECK: %[[OUT:.+]] = tensor.empty(%[[AMAP1]], %[[AMAP0]]) : tensor +// CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG2]], %[[CST0]] : tensor +// CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG2]], %[[CST1]] : tensor +// CHECK: %[[OUT:.+]] = tensor.empty(%[[DIM0]], %[[DIM1]]) : tensor // CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 1] // CHECK-SAME: inner_dims_pos = [1, 0] From ebf0599314e17c3ab89f303d452811b1db3e6d1e Mon Sep 17 00:00:00 2001 From: SJW <48454132+sjw36@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:24:58 -0500 Subject: [PATCH 144/425] [MLIR][SCF] Add support for loop pipeline peeling for dynamic loops. (#106436) Allow speculative execution and predicate results per stage. --- .../Dialect/SCF/Transforms/LoopPipelining.cpp | 116 ++++++++++++------ mlir/test/Dialect/SCF/loop-pipelining.mlir | 103 +++++++++++++++- mlir/test/lib/Dialect/SCF/TestSCFUtils.cpp | 4 +- 3 files changed, 179 insertions(+), 44 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp index d8e1cc0ecef88..a34542f0161ac 100644 --- a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp @@ -94,8 +94,8 @@ struct LoopPipelinerInternal { RewriterBase &rewriter); /// Emits the epilogue, this creates `maxStage - 1` part which will contain /// operations from stages [i; maxStage], where i is the part index. 
- void emitEpilogue(RewriterBase &rewriter, - llvm::SmallVector &returnValues); + LogicalResult emitEpilogue(RewriterBase &rewriter, + llvm::SmallVector &returnValues); }; bool LoopPipelinerInternal::initializeLoopInfo( @@ -133,10 +133,6 @@ bool LoopPipelinerInternal::initializeLoopInfo( LDBG("--no epilogue or predicate set -> BAIL"); return false; } - if (dynamicLoop && peelEpilogue) { - LDBG("--dynamic loop doesn't support epilogue yet -> BAIL"); - return false; - } std::vector> schedule; options.getScheduleFn(forOp, schedule); if (schedule.empty()) { @@ -313,10 +309,10 @@ void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { }); int predicateIdx = i - stages[op]; if (predicates[predicateIdx]) { + OpBuilder::InsertionGuard insertGuard(rewriter); newOp = predicateFn(rewriter, newOp, predicates[predicateIdx]); assert(newOp && "failed to predicate op."); } - rewriter.setInsertionPointAfter(newOp); if (annotateFn) annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i); for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { @@ -561,6 +557,7 @@ LogicalResult LoopPipelinerInternal::createKernel( } if (predicates[useStage]) { + OpBuilder::InsertionGuard insertGuard(rewriter); newOp = predicateFn(rewriter, newOp, predicates[useStage]); if (!newOp) return failure(); @@ -568,7 +565,6 @@ LogicalResult LoopPipelinerInternal::createKernel( for (auto values : llvm::zip(op->getResults(), newOp->getResults())) mapping.map(std::get<0>(values), std::get<1>(values)); } - rewriter.setInsertionPointAfter(newOp); if (annotateFn) annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0); } @@ -640,70 +636,113 @@ LogicalResult LoopPipelinerInternal::createKernel( return success(); } -void LoopPipelinerInternal::emitEpilogue( - RewriterBase &rewriter, llvm::SmallVector &returnValues) { +LogicalResult +LoopPipelinerInternal::emitEpilogue(RewriterBase &rewriter, + llvm::SmallVector &returnValues) { + Location loc = forOp.getLoc(); // Emit different versions of the induction variable. They will be // removed by dead code if not used. + + // bounds_range = ub - lb + // total_iterations = (bounds_range + step - 1) / step + Type t = lb.getType(); + Value minus1 = + rewriter.create(loc, rewriter.getIntegerAttr(t, -1)); + Value boundsRange = rewriter.create(loc, ub, lb); + Value rangeIncr = rewriter.create(loc, boundsRange, step); + Value rangeDecr = rewriter.create(loc, rangeIncr, minus1); + Value totalIterations = rewriter.create(loc, rangeDecr, step); + + SmallVector predicates(maxStage + 1); for (int64_t i = 0; i < maxStage; i++) { - Location loc = forOp.getLoc(); - Type t = lb.getType(); - Value minusOne = - rewriter.create(loc, rewriter.getIntegerAttr(t, -1)); - // number of iterations = ((ub - 1) - lb) / step - Value totalNumIteration = rewriter.create( - loc, - rewriter.create( - loc, rewriter.create(loc, ub, minusOne), lb), - step); - // newLastIter = lb + step * ((((ub - 1) - lb) / step) - i) + // iterI = total_iters - 1 - i + // May go negative... 
Value minusI = rewriter.create(loc, rewriter.getIntegerAttr(t, -i)); + Value iterI = rewriter.create( + loc, rewriter.create(loc, totalIterations, minus1), + minusI); + // newLastIter = lb + step * iterI Value newlastIter = rewriter.create( - loc, lb, - rewriter.create( - loc, step, - rewriter.create(loc, totalNumIteration, minusI))); + loc, lb, rewriter.create(loc, step, iterI)); + setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i); + + if (dynamicLoop) { + // pred = iterI >= lb + predicates[i + 1] = rewriter.create( + loc, arith::CmpIPredicate::sge, iterI, lb); + } } + // Emit `maxStage - 1` epilogue part that includes operations from stages // [i; maxStage]. for (int64_t i = 1; i <= maxStage; i++) { + SmallVector> returnMap(returnValues.size()); for (Operation *op : opOrder) { if (stages[op] < i) continue; + unsigned currentVersion = maxStage - stages[op] + i; + unsigned nextVersion = currentVersion + 1; Operation *newOp = cloneAndUpdateOperands(rewriter, op, [&](OpOperand *newOperand) { auto it = valueMapping.find(newOperand->get()); if (it != valueMapping.end()) { - Value replacement = it->second[maxStage - stages[op] + i]; + Value replacement = it->second[currentVersion]; newOperand->set(replacement); } }); + if (dynamicLoop) { + OpBuilder::InsertionGuard insertGuard(rewriter); + newOp = predicateFn(rewriter, newOp, predicates[currentVersion]); + if (!newOp) + return failure(); + } if (annotateFn) annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1); - for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { - setValueMapping(op->getResult(destId), newOp->getResult(destId), - maxStage - stages[op] + i); + + for (auto [opRes, newRes] : + llvm::zip(op->getResults(), newOp->getResults())) { + setValueMapping(opRes, newRes, currentVersion); // If the value is a loop carried dependency update the loop argument // mapping and keep track of the last version to replace the original // forOp uses. for (OpOperand &operand : forOp.getBody()->getTerminator()->getOpOperands()) { - if (operand.get() != op->getResult(destId)) + if (operand.get() != opRes) continue; - unsigned version = maxStage - stages[op] + i + 1; // If the version is greater than maxStage it means it maps to the // original forOp returned value. - if (version > maxStage) { - returnValues[operand.getOperandNumber()] = newOp->getResult(destId); - continue; - } - setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], - newOp->getResult(destId), version); + unsigned ri = operand.getOperandNumber(); + returnValues[ri] = newRes; + Value mapVal = forOp.getRegionIterArgs()[ri]; + returnMap[ri] = std::make_pair(mapVal, currentVersion); + if (nextVersion <= maxStage) + setValueMapping(mapVal, newRes, nextVersion); + } + } + } + if (dynamicLoop) { + // Select return values from this stage (live outs) based on predication. + // If the stage is valid select the peeled value, else use previous stage + // value. 
+ for (auto pair : llvm::enumerate(returnValues)) { + unsigned ri = pair.index(); + auto [mapVal, currentVersion] = returnMap[ri]; + if (mapVal) { + unsigned nextVersion = currentVersion + 1; + Value pred = predicates[currentVersion]; + Value prevValue = valueMapping[mapVal][currentVersion]; + auto selOp = rewriter.create(loc, pred, pair.value(), + prevValue); + returnValues[ri] = selOp; + if (nextVersion <= maxStage) + setValueMapping(mapVal, selOp, nextVersion); } } } } + return success(); } void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) { @@ -760,7 +799,8 @@ FailureOr mlir::scf::pipelineForLoop(RewriterBase &rewriter, ForOp forOp, if (options.peelEpilogue) { // 4. Emit the epilogue after the new forOp. rewriter.setInsertionPointAfter(newForOp); - pipeliner.emitEpilogue(rewriter, returnValues); + if (failed(pipeliner.emitEpilogue(rewriter, returnValues))) + return failure(); } // 5. Erase the original loop and replace the uses with the epilogue output. if (forOp->getNumResults() > 0) diff --git a/mlir/test/Dialect/SCF/loop-pipelining.mlir b/mlir/test/Dialect/SCF/loop-pipelining.mlir index 9687f80f5ddfc..4a1406faabce1 100644 --- a/mlir/test/Dialect/SCF/loop-pipelining.mlir +++ b/mlir/test/Dialect/SCF/loop-pipelining.mlir @@ -764,11 +764,44 @@ func.func @stage_0_value_escape(%A: memref, %result: memref, %ub: // NOEPILOGUE: memref.load %[[A]][%[[IV3]]] : memref // NOEPILOGUE: scf.yield %[[V2]], %[[L3]] : f32, f32 -// In case dynamic loop pipelining is off check that the transformation didn't -// apply. +// Check for predicated epilogue for dynamic loop. // CHECK-LABEL: dynamic_loop( -// CHECK-NOT: memref.load -// CHECK: scf.for +// CHECK: %{{.*}}:2 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) +// CHECK: memref.store %[[ARG6]], %{{.*}}[%[[ARG5]]] +// CHECK: %[[ADDF_24:.*]] = arith.addf %[[ARG7]], %{{.*}} +// CHECK: %[[MULI_25:.*]] = arith.muli %{{.*}}, %{{.*}} +// CHECK: %[[ADDI_26:.*]] = arith.addi %[[ARG5]], %[[MULI_25]] +// CHECK: %[[LOAD_27:.*]] = memref.load %{{.*}}[%[[ADDI_26]]] +// CHECK: scf.yield %[[ADDF_24]], %[[LOAD_27]] +// CHECK: } +// CHECK: %[[SUBI_10:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[ADDI_11:.*]] = arith.addi %[[SUBI_10]], %{{.*}} +// CHECK: %[[ADDI_12:.*]] = arith.addi %[[ADDI_11]], %{{.*}}-1 +// CHECK: %[[DIVUI_13:.*]] = arith.divui %[[ADDI_12]], %{{.*}} +// CHECK: %[[ADDI_14:.*]] = arith.addi %[[DIVUI_13]], %{{.*}}-1 +// CHECK: %[[MULI_15:.*]] = arith.muli %{{.*}}, %[[ADDI_14]] +// CHECK: %[[ADDI_16:.*]] = arith.addi %{{.*}}, %[[MULI_15]] +// CHECK: %[[CMPI_17:.*]] = arith.cmpi sge, %[[ADDI_14]], %{{.*}} +// CHECK: %[[ADDI_18:.*]] = arith.addi %[[DIVUI_13]], %{{.*}}-1 +// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ADDI_18]], %{{.*}}-1 +// CHECK: %[[MULI_20:.*]] = arith.muli %{{.*}}, %[[ADDI_19]] +// CHECK: %[[ADDI_21:.*]] = arith.addi %{{.*}}, %[[MULI_20]] +// CHECK: %[[CMPI_22:.*]] = arith.cmpi sge, %[[ADDI_19]], %{{.*}} +// CHECK: scf.if %[[CMPI_17]] { +// CHECK: memref.store %{{.*}}#0, %{{.*}}[%[[ADDI_21]]] +// CHECK: } else { +// CHECK: } +// CHECK: %[[IF_23:.*]] = scf.if %[[CMPI_22]] -> (f32) { +// CHECK: %[[ADDF_24:.*]] = arith.addf %{{.*}}#1, %{{.*}} +// CHECK: scf.yield %[[ADDF_24]] +// CHECK: } else { +// CHECK: scf.yield %{{.*}} +// CHECK: } +// CHECK: scf.if %[[CMPI_22]] { +// CHECK: memref.store %[[IF_23]], %{{.*}}[%[[ADDI_16]]] +// CHECK: } else { +// CHECK: } +// CHECK: return func.func @dynamic_loop(%A: memref, %result: memref, %lb: index, %ub: 
index, %step: index) { %cf = arith.constant 1.0 : f32 scf.for %i0 = %lb to %ub step %step { @@ -781,6 +814,68 @@ func.func @dynamic_loop(%A: memref, %result: memref, %lb: index, % // ----- +// NOEPILOGUE-LABEL: func.func @dynamic_loop_result +// NOEPILOGUE: %{{.*}}:2 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) +// NOEPILOGUE: %[[SUBI_3:.*]] = arith.subi %{{.*}}, %{{.*}} +// NOEPILOGUE: %[[CMPI_4:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_3]] +// NOEPILOGUE: %[[ADDF_5:.*]] = arith.addf %[[ARG7]], %[[ARG6]] +// NOEPILOGUE: %[[MULF_6:.*]] = arith.mulf %[[ADDF_5]], %{{.*}} +// NOEPILOGUE: %[[ADDI_7:.*]] = arith.addi %[[ARG5]], %{{.*}} +// NOEPILOGUE: %[[IF_8:.*]] = scf.if %[[CMPI_4]] +// NOEPILOGUE: %[[LOAD_9:.*]] = memref.load %{{.*}}[%[[ADDI_7]]] +// NOEPILOGUE: scf.yield %[[LOAD_9]] +// NOEPILOGUE: } else { +// NOEPILOGUE: scf.yield %{{.*}} +// NOEPILOGUE: } +// NOEPILOGUE: scf.yield %[[MULF_6]], %[[IF_8]] +// NOEPILOGUE: } +// NOEPILOGUE: memref.store %{{.*}}#0, %{{.*}}[%{{.*}}] + +// Check for predicated epilogue for dynamic loop. +// CHECK-LABEL: func.func @dynamic_loop_result +// CHECK: %{{.*}}:2 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) +// CHECK: %[[ADDF_13:.*]] = arith.addf %[[ARG7]], %[[ARG6]] +// CHECK: %[[MULF_14:.*]] = arith.mulf %[[ADDF_13]], %{{.*}} +// CHECK: %[[ADDI_15:.*]] = arith.addi %[[ARG5]], %{{.*}} +// CHECK: %[[LOAD_16:.*]] = memref.load %{{.*}}[%[[ADDI_15]]] +// CHECK: scf.yield %[[MULF_14]], %[[LOAD_16]] +// CHECK: } +// CHECK: %[[SUBI_4:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[ADDI_5:.*]] = arith.addi %[[SUBI_4]], %{{.*}} +// CHECK: %[[ADDI_6:.*]] = arith.addi %[[ADDI_5]], %{{.*}}-1 +// CHECK: %[[DIVUI_7:.*]] = arith.divui %[[ADDI_6]], %{{.*}} +// CHECK: %[[ADDI_8:.*]] = arith.addi %[[DIVUI_7]], %{{.*}}-1 +// CHECK: %[[CMPI_9:.*]] = arith.cmpi sge, %[[ADDI_8]], %{{.*}} +// CHECK: %[[IF_10:.*]] = scf.if %[[CMPI_9]] +// CHECK: %[[ADDF_13:.*]] = arith.addf %{{.*}}#1, %{{.*}}#0 +// CHECK: scf.yield %[[ADDF_13]] +// CHECK: } else { +// CHECK: scf.yield %{{.*}} +// CHECK: } +// CHECK: %[[IF_11:.*]] = scf.if %[[CMPI_9]] +// CHECK: %[[MULF_13:.*]] = arith.mulf %[[IF_10]], %{{.*}} +// CHECK: scf.yield %[[MULF_13]] +// CHECK: } else { +// CHECK: scf.yield %{{.*}} +// CHECK: } +// CHECK: %[[SELECT_12:.*]] = arith.select %[[CMPI_9]], %[[IF_11]], %{{.*}}#0 +// CHECK: memref.store %[[SELECT_12]], %{{.*}}[%{{.*}}] +func.func @dynamic_loop_result(%A: memref, %result: memref, %lb: index, %ub: index, %step: index) { + %cf0 = arith.constant 1.0 : f32 + %cf1 = arith.constant 33.0 : f32 + %cst = arith.constant 0 : index + %res:1 = scf.for %i0 = %lb to %ub step %step iter_args (%arg0 = %cf0) -> (f32) { + %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref + %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32 + %A2_elem = arith.mulf %A1_elem, %cf1 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32 + scf.yield %A2_elem : f32 + } { __test_pipelining_loop__ } + memref.store %res#0, %result[%cst] : memref + return +} + +// ----- + // CHECK-LABEL: yield_constant_loop( // CHECK-SAME: %[[A:.*]]: memref) -> f32 { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index diff --git a/mlir/test/lib/Dialect/SCF/TestSCFUtils.cpp b/mlir/test/lib/Dialect/SCF/TestSCFUtils.cpp index 8a92d840ad130..3ff7f9966e93d 100644 --- 
a/mlir/test/lib/Dialect/SCF/TestSCFUtils.cpp +++ b/mlir/test/lib/Dialect/SCF/TestSCFUtils.cpp @@ -214,12 +214,12 @@ struct TestSCFPipeliningPass RewritePatternSet patterns(&getContext()); mlir::scf::PipeliningOption options; options.getScheduleFn = getSchedule; + options.supportDynamicLoops = true; + options.predicateFn = predicateOp; if (annotatePipeline) options.annotateFn = annotate; if (noEpiloguePeeling) { - options.supportDynamicLoops = true; options.peelEpilogue = false; - options.predicateFn = predicateOp; } scf::populateSCFLoopPipeliningPatterns(patterns, options); (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); From 0fffdeb5f46078ddcc61e112cd38856b1165f050 Mon Sep 17 00:00:00 2001 From: Ziqing Luo Date: Wed, 4 Sep 2024 12:34:43 -0700 Subject: [PATCH 145/425] [-Wunsafe-buffer-usage] Warning Libc functions (#101583) [-Wunsafe-buffer-usage] Add warn on unsafe calls to libc functions Warning about calls to libc functions involving buffer access. Warned functions are hardcoded by names. (rdar://117182250) --- .../Analysis/Analyses/UnsafeBufferUsage.h | 15 + .../Analyses/UnsafeBufferUsageGadgets.def | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 7 + clang/lib/Analysis/UnsafeBufferUsage.cpp | 513 +++++++++++++++++- clang/lib/Sema/AnalysisBasedWarnings.cpp | 14 + ...-usage-libc-functions-inline-namespace.cpp | 60 ++ ...arn-unsafe-buffer-usage-libc-functions.cpp | 106 ++++ ...n-unsafe-buffer-usage-test-unreachable.cpp | 4 +- 8 files changed, 716 insertions(+), 4 deletions(-) create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h index 228b4ae1e3e11..aa2c01ad10d45 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h @@ -15,6 +15,7 @@ #define LLVM_CLANG_ANALYSIS_ANALYSES_UNSAFEBUFFERUSAGE_H #include "clang/AST/Decl.h" +#include "clang/AST/Expr.h" #include "clang/AST/Stmt.h" #include "clang/Basic/SourceLocation.h" #include "llvm/Support/Debug.h" @@ -106,6 +107,20 @@ class UnsafeBufferUsageHandler { virtual void handleUnsafeOperation(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) = 0; + /// Invoked when a call to an unsafe libc function is found. + /// \param PrintfInfo + /// is 0 if the callee function is not a member of the printf family; + /// is 1 if the callee is `sprintf`; + /// is 2 if arguments of the call have `__size_by` relation but are not in a + /// safe pattern; + /// is 3 if string arguments do not guarantee null-termination + /// is 4 if the callee takes va_list + /// \param UnsafeArg one of the actual arguments that is unsafe, non-null + /// only when `2 <= PrintfInfo <= 3` + virtual void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, + ASTContext &Ctx, + const Expr *UnsafeArg = nullptr) = 0; + /// Invoked when an unsafe operation with a std container is found. 
virtual void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def index 242ad763ba62b..ac01b285ae833 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def @@ -38,6 +38,7 @@ WARNING_GADGET(PointerArithmetic) WARNING_GADGET(UnsafeBufferUsageAttr) WARNING_GADGET(UnsafeBufferUsageCtorAttr) WARNING_GADGET(DataInvocation) +WARNING_GADGET(UnsafeLibcFunctionCall) WARNING_CONTAINER_GADGET(SpanTwoParamConstructor) // Uses of `std::span(arg0, arg1)` FIXABLE_GADGET(ULCArraySubscript) // `DRE[any]` in an Unspecified Lvalue Context FIXABLE_GADGET(DerefSimplePtrArithFixable) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dcb49d8a67604..35f68f51dfb35 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12412,6 +12412,13 @@ def warn_unsafe_buffer_operation : Warning< "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data|" "field %1 prone to unsafe buffer manipulation}0">, InGroup, DefaultIgnore; +def warn_unsafe_buffer_libc_call : Warning< + "function %0 is unsafe">, + InGroup, DefaultIgnore; +def note_unsafe_buffer_printf_call : Note< + "%select{|change to 'snprintf' for explicit bounds checking | buffer pointer and size may not match" + "|string argument is not guaranteed to be null-terminated" + "|'va_list' is unsafe}0">; def note_unsafe_buffer_operation : Note< "used%select{| in pointer arithmetic| in buffer access}0 here">; def note_unsafe_buffer_variable_fixit_group : Note< diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index da7446913f7c8..f0d072643f8ff 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -10,12 +10,12 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/FormatString.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Stmt.h" #include "clang/AST/StmtVisitor.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" -#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/Lexer.h" #include "clang/Lex/Preprocessor.h" @@ -443,6 +443,426 @@ AST_MATCHER(ArraySubscriptExpr, isSafeArraySubscript) { return false; } +AST_MATCHER_P(CallExpr, hasNumArgs, unsigned, Num) { + return Node.getNumArgs() == Num; +} + +namespace libc_func_matchers { +// Under `libc_func_matchers`, define a set of matchers that match unsafe +// functions in libc and unsafe calls to them. + +// A tiny parser to strip off common prefix and suffix of libc function names +// in real code. 
+// +// Given a function name, `matchName` returns `CoreName` according to the +// following grammar: +// +// LibcName := CoreName | CoreName + "_s" +// MatchingName := "__builtin_" + LibcName | +// "__builtin___" + LibcName + "_chk" | +// "__asan_" + LibcName +// +struct LibcFunNamePrefixSuffixParser { + StringRef matchName(StringRef FunName, bool isBuiltin) { + // Try to match __builtin_: + if (isBuiltin && FunName.starts_with("__builtin_")) + // Then either it is __builtin_LibcName or __builtin___LibcName_chk or + // no match: + return matchLibcNameOrBuiltinChk( + FunName.drop_front(10 /* truncate "__builtin_" */)); + // Try to match __asan_: + if (FunName.starts_with("__asan_")) + return matchLibcName(FunName.drop_front(7 /* truncate of "__asan_" */)); + return matchLibcName(FunName); + } + + // Parameter `Name` is the substring after stripping off the prefix + // "__builtin_". + StringRef matchLibcNameOrBuiltinChk(StringRef Name) { + if (Name.starts_with("__") && Name.ends_with("_chk")) + return matchLibcName( + Name.drop_front(2).drop_back(4) /* truncate "__" and "_chk" */); + return matchLibcName(Name); + } + + StringRef matchLibcName(StringRef Name) { + if (Name.ends_with("_s")) + return Name.drop_back(2 /* truncate "_s" */); + return Name; + } +}; + +// A pointer type expression is known to be null-terminated, if it has the +// form: E.c_str(), for any expression E of `std::string` type. +static bool isNullTermPointer(const Expr *Ptr) { + if (isa(Ptr->IgnoreParenImpCasts())) + return true; + if (isa(Ptr->IgnoreParenImpCasts())) + return true; + if (auto *MCE = dyn_cast(Ptr->IgnoreParenImpCasts())) { + const CXXMethodDecl *MD = MCE->getMethodDecl(); + const CXXRecordDecl *RD = MCE->getRecordDecl()->getCanonicalDecl(); + + if (MD && RD && RD->isInStdNamespace()) + if (MD->getName() == "c_str" && RD->getName() == "basic_string") + return true; + } + return false; +} + +// Return true iff at least one of following cases holds: +// 1. Format string is a literal and there is an unsafe pointer argument +// corresponding to an `s` specifier; +// 2. Format string is not a literal and there is least an unsafe pointer +// argument (including the formatter argument). +// +// `UnsafeArg` is the output argument that will be set only if this function +// returns true. 
+static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, + const unsigned FmtArgIdx, ASTContext &Ctx, + bool isKprintf = false) { + class StringFormatStringHandler + : public analyze_format_string::FormatStringHandler { + const CallExpr *Call; + unsigned FmtArgIdx; + const Expr *&UnsafeArg; + + public: + StringFormatStringHandler(const CallExpr *Call, unsigned FmtArgIdx, + const Expr *&UnsafeArg) + : Call(Call), FmtArgIdx(FmtArgIdx), UnsafeArg(UnsafeArg) {} + + bool HandlePrintfSpecifier(const analyze_printf::PrintfSpecifier &FS, + const char *startSpecifier, + unsigned specifierLen, + const TargetInfo &Target) override { + if (FS.getConversionSpecifier().getKind() == + analyze_printf::PrintfConversionSpecifier::sArg) { + unsigned ArgIdx = FS.getPositionalArgIndex() + FmtArgIdx; + + if (0 < ArgIdx && ArgIdx < Call->getNumArgs()) + if (!isNullTermPointer(Call->getArg(ArgIdx))) { + UnsafeArg = Call->getArg(ArgIdx); // output + // returning false stops parsing immediately + return false; + } + } + return true; // continue parsing + } + }; + + const Expr *Fmt = Call->getArg(FmtArgIdx); + + if (auto *SL = dyn_cast(Fmt->IgnoreParenImpCasts())) { + StringRef FmtStr = SL->getString(); + StringFormatStringHandler Handler(Call, FmtArgIdx, UnsafeArg); + + return analyze_format_string::ParsePrintfString( + Handler, FmtStr.begin(), FmtStr.end(), Ctx.getLangOpts(), + Ctx.getTargetInfo(), isKprintf); + } + // If format is not a string literal, we cannot analyze the format string. + // In this case, this call is considered unsafe if at least one argument + // (including the format argument) is unsafe pointer. + return llvm::any_of( + llvm::make_range(Call->arg_begin() + FmtArgIdx, Call->arg_end()), + [&UnsafeArg](const Expr *Arg) -> bool { + if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) { + UnsafeArg = Arg; + return true; + } + return false; + }); +} + +// Matches a FunctionDecl node such that +// 1. It's name, after stripping off predefined prefix and suffix, is +// `CoreName`; and +// 2. `CoreName` or `CoreName[str/wcs]` is one of the `PredefinedNames`, which +// is a set of libc function names. +// +// Note: For predefined prefix and suffix, see `LibcFunNamePrefixSuffixParser`. +// The notation `CoreName[str/wcs]` means a new name obtained from replace +// string "wcs" with "str" in `CoreName`. +AST_MATCHER(FunctionDecl, isPredefinedUnsafeLibcFunc) { + static std::unique_ptr> PredefinedNames = nullptr; + if (!PredefinedNames) + PredefinedNames = + std::make_unique, std::set>({ + // numeric conversion: + "atof", + "atoi", + "atol", + "atoll", + "strtol", + "strtoll", + "strtoul", + "strtoull", + "strtof", + "strtod", + "strtold", + "strtoimax", + "strtoumax", + // "strfromf", "strfromd", "strfroml", // C23? 
+ // string manipulation: + "strcpy", + "strncpy", + "strlcpy", + "strcat", + "strncat", + "strlcat", + "strxfrm", + "strdup", + "strndup", + // string examination: + "strlen", + "strnlen", + "strcmp", + "strncmp", + "stricmp", + "strcasecmp", + "strcoll", + "strchr", + "strrchr", + "strspn", + "strcspn", + "strpbrk", + "strstr", + "strtok", + // "mem-" functions + "memchr", + "wmemchr", + "memcmp", + "wmemcmp", + "memcpy", + "memccpy", + "mempcpy", + "wmemcpy", + "memmove", + "wmemmove", + "memset", + "wmemset", + // IO: + "fread", + "fwrite", + "fgets", + "fgetws", + "gets", + "fputs", + "fputws", + "puts", + // others + "strerror_s", + "strerror_r", + "bcopy", + "bzero", + "bsearch", + "qsort", + }); + + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + // Match predefined names: + if (PredefinedNames->find(Name) != PredefinedNames->end()) + return true; + + std::string NameWCS = Name.str(); + size_t WcsPos = NameWCS.find("wcs"); + + while (WcsPos != std::string::npos) { + NameWCS[WcsPos++] = 's'; + NameWCS[WcsPos++] = 't'; + NameWCS[WcsPos++] = 'r'; + WcsPos = NameWCS.find("wcs", WcsPos); + } + if (PredefinedNames->find(NameWCS) != PredefinedNames->end()) + return true; + // All `scanf` functions are unsafe (including `sscanf`, `vsscanf`, etc.. They + // all should end with "scanf"): + return Name.ends_with("scanf"); +} + +// Match a call to one of the `v*printf` functions taking `va_list`. We cannot +// check safety for these functions so they should be changed to their +// non-va_list versions. +AST_MATCHER(FunctionDecl, isUnsafeVaListPrintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf")) + return false; // neither printf nor scanf + return Name.starts_with("v"); +} + +// Matches a call to one of the `sprintf` functions as they are always unsafe +// and should be changed to `snprintf`. +AST_MATCHER(FunctionDecl, isUnsafeSprintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf") || + // Let `isUnsafeVaListPrintfFunc` check for cases with va-list: + Name.starts_with("v")) + return false; + + StringRef Prefix = Name.drop_back(6); + + if (Prefix.ends_with("w")) + Prefix = Prefix.drop_back(1); + return Prefix == "s"; +} + +// Match function declarations of `printf`, `fprintf`, `snprintf` and their wide +// character versions. Calls to these functions can be safe if their arguments +// are carefully made safe. +AST_MATCHER(FunctionDecl, isNormalPrintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf") || Name.starts_with("v")) + return false; + + StringRef Prefix = Name.drop_back(6); + + if (Prefix.ends_with("w")) + Prefix = Prefix.drop_back(1); + + return Prefix.empty() || Prefix == "k" || Prefix == "f" || Prefix == "sn"; +} + +// This matcher requires that it is known that the callee `isNormalPrintf`. +// Then if the format string is a string literal, this matcher matches when at +// least one string argument is unsafe. 
If the format is not a string literal, +// this matcher matches when at least one pointer type argument is unsafe. +AST_MATCHER_P(CallExpr, hasUnsafePrintfStringArg, + clang::ast_matchers::internal::Matcher, + UnsafeStringArgMatcher) { + // Determine what printf it is: + const Expr *FirstArg = Node.getArg(0); + ASTContext &Ctx = Finder->getASTContext(); + + if (isa(FirstArg->IgnoreParenImpCasts())) { + // It is a printf/kprintf. And, the format is a string literal: + bool isKprintf = false; + const Expr *UnsafeArg; + + if (auto *Callee = Node.getDirectCallee()) + if (auto *II = Node.getDirectCallee()->getIdentifier()) + isKprintf = II->getName() == "kprintf"; + if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 0, Ctx, isKprintf)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + + QualType PtrTy = FirstArg->getType(); + + assert(PtrTy->isPointerType()); + + QualType PteTy = (cast(PtrTy))->getPointeeType(); + + if (!Ctx.getFILEType().isNull() /* If `FILE *` is not ever in the ASTContext, + there can't be any file pointer then */ + && PteTy.getCanonicalType() == Ctx.getFILEType().getCanonicalType()) { + // It is a fprintf: + const Expr *UnsafeArg; + + if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 1, Ctx, false)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + + const Expr *SecondArg = Node.getArg(1); + + if (SecondArg->getType()->isIntegerType()) { + // It is a snprintf: + const Expr *UnsafeArg; + + if (unsigned UnsafeArgIdx = + hasUnsafeFormatOrSArg(&Node, UnsafeArg, 2, Ctx, false)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + // It is printf but the format string is passed by pointer. The only thing we + // can do is to require all pointers to be null-terminated: + for (auto Arg : Node.arguments()) + if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) + if (UnsafeStringArgMatcher.matches(*Arg, Finder, Builder)) + return true; + return false; +} + +// This matcher requires that it is known that the callee `isNormalPrintf`. +// Then it matches if the first two arguments of the call is a pointer and an +// integer and they are not in a safe pattern. +// +// For the first two arguments: `ptr` and `size`, they are safe if in the +// following patterns: +// ptr := DRE.data(); +// size:= DRE.size()/DRE.size_bytes() +// And DRE is a hardened container or view. 
+AST_MATCHER(CallExpr, hasUnsafeSnprintfBuffer) { + if (Node.getNumArgs() < 3) + return false; // not an snprintf call + + const Expr *Buf = Node.getArg(0), *Size = Node.getArg(1); + + if (!Buf->getType()->isPointerType() || !Size->getType()->isIntegerType()) + return false; // not an snprintf call + + static StringRef SizedObjs[] = {"span", "array", "vector", + "basic_string_view", "basic_string"}; + Buf = Buf->IgnoreParenImpCasts(); + Size = Size->IgnoreParenImpCasts(); + if (auto *MCEPtr = dyn_cast(Buf)) + if (auto *MCESize = dyn_cast(Size)) { + auto *DREOfPtr = dyn_cast( + MCEPtr->getImplicitObjectArgument()->IgnoreParenImpCasts()); + auto *DREOfSize = dyn_cast( + MCESize->getImplicitObjectArgument()->IgnoreParenImpCasts()); + + if (!DREOfPtr || !DREOfSize) + return true; // not in safe pattern + if (DREOfPtr->getDecl() != DREOfSize->getDecl()) + return true; // not in safe pattern + if (MCEPtr->getMethodDecl()->getName() != "data") + return true; // not in safe pattern + + if (MCESize->getMethodDecl()->getName() == "size_bytes" || + // Note here the pointer must be a pointer-to-char type unless there + // is explicit casting. If there is explicit casting, this branch + // is unreachable. Thus, at this branch "size" and "size_bytes" are + // equivalent as the pointer is a char pointer: + MCESize->getMethodDecl()->getName() == "size") + for (StringRef SizedObj : SizedObjs) + if (MCEPtr->getRecordDecl()->isInStdNamespace() && + MCEPtr->getRecordDecl()->getCanonicalDecl()->getName() == + SizedObj) + return false; // It is in fact safe + } + return true; // ptr and size are not in safe pattern +} +} // namespace libc_func_matchers } // namespace clang::ast_matchers namespace { @@ -1030,6 +1450,97 @@ class DataInvocationGadget : public WarningGadget { DeclUseList getClaimedVarUseSites() const override { return {}; } }; +class UnsafeLibcFunctionCallGadget : public WarningGadget { + const CallExpr *const Call; + const Expr *UnsafeArg = nullptr; + constexpr static const char *const Tag = "UnsafeLibcFunctionCall"; + // Extra tags for additional information: + constexpr static const char *const UnsafeSprintfTag = + "UnsafeLibcFunctionCall_sprintf"; + constexpr static const char *const UnsafeSizedByTag = + "UnsafeLibcFunctionCall_sized_by"; + constexpr static const char *const UnsafeStringTag = + "UnsafeLibcFunctionCall_string"; + constexpr static const char *const UnsafeVaListTag = + "UnsafeLibcFunctionCall_va_list"; + + enum UnsafeKind { + OTHERS = 0, // no specific information, the callee function is unsafe + SPRINTF = 1, // never call `-sprintf`s, call `-snprintf`s instead. 
+ SIZED_BY = + 2, // the first two arguments of `snprintf` function have + // "__sized_by" relation but they do not conform to safe patterns + STRING = 3, // an argument is a pointer-to-char-as-string but does not + // guarantee null-termination + VA_LIST = 4, // one of the `-printf`s function that take va_list, which is + // considered unsafe as it is not compile-time check + } WarnedFunKind = OTHERS; + +public: + UnsafeLibcFunctionCallGadget(const MatchFinder::MatchResult &Result) + : WarningGadget(Kind::UnsafeLibcFunctionCall), + Call(Result.Nodes.getNodeAs(Tag)) { + if (Result.Nodes.getNodeAs(UnsafeSprintfTag)) + WarnedFunKind = SPRINTF; + else if (auto *E = Result.Nodes.getNodeAs(UnsafeStringTag)) { + WarnedFunKind = STRING; + UnsafeArg = E; + } else if (Result.Nodes.getNodeAs(UnsafeSizedByTag)) { + WarnedFunKind = SIZED_BY; + UnsafeArg = Call->getArg(0); + } else if (Result.Nodes.getNodeAs(UnsafeVaListTag)) + WarnedFunKind = VA_LIST; + } + + static Matcher matcher() { + return stmt(anyOf( + callExpr( + callee(functionDecl(anyOf( + // Match a predefined unsafe libc + // function: + functionDecl(libc_func_matchers::isPredefinedUnsafeLibcFunc()), + // Match a call to one of the `v*printf` functions + // taking va-list, which cannot be checked at + // compile-time: + functionDecl(libc_func_matchers::isUnsafeVaListPrintfFunc()) + .bind(UnsafeVaListTag), + // Match a call to a `sprintf` function, which is never + // safe: + functionDecl(libc_func_matchers::isUnsafeSprintfFunc()) + .bind(UnsafeSprintfTag)))), + // (unless the call has a sole string literal argument): + unless( + allOf(hasArgument(0, expr(stringLiteral())), hasNumArgs(1)))), + + // The following two cases require checking against actual + // arguments of the call: + + // Match a call to an `snprintf` function. And first two + // arguments of the call (that describe a buffer) are not in + // safe patterns: + callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), + libc_func_matchers::hasUnsafeSnprintfBuffer()) + .bind(UnsafeSizedByTag), + // Match a call to a `printf` function, which can be safe if + // all arguments are null-terminated: + callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), + libc_func_matchers::hasUnsafePrintfStringArg( + expr().bind(UnsafeStringTag))))); + } + + const Stmt *getBaseStmt() const { return Call; } + + SourceLocation getSourceLoc() const override { return Call->getBeginLoc(); } + + void handleUnsafeOperation(UnsafeBufferUsageHandler &Handler, + bool IsRelatedToDecl, + ASTContext &Ctx) const override { + Handler.handleUnsafeLibcCall(Call, WarnedFunKind, Ctx, UnsafeArg); + } + + DeclUseList getClaimedVarUseSites() const override { return {}; } +}; + // Represents expressions of the form `DRE[*]` in the Unspecified Lvalue // Context (see `isInUnspecifiedLvalueContext`). // Note here `[]` is the built-in subscript operator. 
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index e6ce89dc7ec40..72078ae1534b0 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2304,6 +2304,20 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { } } + void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, + ASTContext &Ctx, + const Expr *UnsafeArg = nullptr) override { + S.Diag(Call->getBeginLoc(), diag::warn_unsafe_buffer_libc_call) + << Call->getDirectCallee() // We've checked there is a direct callee + << Call->getSourceRange(); + if (PrintfInfo > 0) { + SourceRange R = + UnsafeArg ? UnsafeArg->getSourceRange() : Call->getSourceRange(); + S.Diag(R.getBegin(), diag::note_unsafe_buffer_printf_call) + << PrintfInfo << R; + } + } + void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) override { diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp new file mode 100644 index 0000000000000..2bd12db93fd52 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp @@ -0,0 +1,60 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ +// RUN: -verify %s + +namespace std { + inline namespace __1 { + template< class InputIt, class OutputIt > + OutputIt copy( InputIt first, InputIt last, + OutputIt d_first ); + + struct iterator{}; + template + struct span { + T * ptr; + T * data(); + unsigned size_bytes(); + unsigned size(); + iterator begin() const noexcept; + iterator end() const noexcept; + }; + + template + struct basic_string { + T* p; + T *c_str(); + T *data(); + unsigned size_bytes(); + }; + + typedef basic_string string; + typedef basic_string wstring; + + // C function under std: + void memcpy(); + void strcpy(); + int snprintf( char* buffer, unsigned buf_size, const char* format, ... 
); + } +} + +void f(char * p, char * q, std::span s) { + std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + std::__1::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::__1::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + + /* Test printfs */ + std::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + std::__1::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + std::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn + std::__1::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn +} + +void v(std::string s1) { + std::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn + std::__1::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn +} + +void g(char *begin, char *end, char *p, std::span s) { + std::copy(begin, end, p); // no warn + std::copy(s.begin(), s.end(), s.begin()); // no warn +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp new file mode 100644 index 0000000000000..1a29654f660c9 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp @@ -0,0 +1,106 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ +// RUN: -verify %s + +typedef struct {} FILE; +void memcpy(); +void __asan_memcpy(); +void strcpy(); +void strcpy_s(); +void wcscpy_s(); +unsigned strlen( const char* str ); +int fprintf( FILE* stream, const char* format, ... ); +int printf( const char* format, ... ); +int sprintf( char* buffer, const char* format, ... ); +int swprintf( char* buffer, const char* format, ... ); +int snprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int snwprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int snwprintf_s( char* buffer, unsigned buf_size, const char* format, ... ); +int vsnprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int sscanf_s(const char * buffer, const char * format, ...); +int sscanf(const char * buffer, const char * format, ... 
); + +namespace std { + template< class InputIt, class OutputIt > + OutputIt copy( InputIt first, InputIt last, + OutputIt d_first ); + + struct iterator{}; + template + struct span { + T * ptr; + T * data(); + unsigned size_bytes(); + unsigned size(); + iterator begin() const noexcept; + iterator end() const noexcept; + }; + + template + struct basic_string { + T* p; + T *c_str(); + T *data(); + unsigned size_bytes(); + }; + + typedef basic_string string; + typedef basic_string wstring; + + // C function under std: + void memcpy(); + void strcpy(); +} + +void f(char * p, char * q, std::span s, std::span s2) { + memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + __builtin_memcpy(p, q, 64); // expected-warning{{function '__builtin_memcpy' is unsafe}} + __builtin___memcpy_chk(p, q, 8, 64); // expected-warning{{function '__builtin___memcpy_chk' is unsafe}} + __asan_memcpy(); // expected-warning{{function '__asan_memcpy' is unsafe}} + strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + strcpy_s(); // expected-warning{{function 'strcpy_s' is unsafe}} + wcscpy_s(); // expected-warning{{function 'wcscpy_s' is unsafe}} + + + /* Test printfs */ + fprintf((FILE*)p, "%s%d", p, *p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} + printf("%s%d", // expected-warning{{function 'printf' is unsafe}} + p, // expected-note{{string argument is not guaranteed to be null-terminated}} note attached to the unsafe argument + *p); + sprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'sprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} + swprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'swprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} + snprintf(q, 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snwprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snwprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snwprintf_s( // expected-warning{{function 'snwprintf_s' is unsafe}} + s.data(), // expected-note{{buffer pointer and size may not match}} // note attached to the buffer + s2.size(), + "%s%d", "hello", *p); + vsnprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // expected-warning{{function 'vsnprintf' is unsafe}} expected-note{{'va_list' is unsafe}} + sscanf(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf' is unsafe}} + sscanf_s(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf_s' is unsafe}} + fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} + fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, "hello"); // no warn + printf("%s%d", "hello", *p); // no warn + snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn + snprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + snwprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + 
snwprintf_s(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + strlen("hello");// no warn +} + +void v(std::string s1, int *p) { + snprintf(s1.data(), s1.size_bytes(), "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + snprintf(s1.data(), s1.size_bytes(), s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn + printf("%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + printf(s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn + fprintf((FILE*)0, "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + fprintf((FILE*)0, s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn +} + + +void g(char *begin, char *end, char *p, std::span s) { + std::copy(begin, end, p); // no warn + std::copy(s.begin(), s.end(), s.begin()); // no warn +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp index 844311c3a51a5..989931e41c0cc 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp @@ -1,8 +1,6 @@ // RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage -fsafe-buffer-usage-suggestions -verify %s -// expected-no-diagnostics - typedef unsigned __darwin_size_t; typedef __darwin_size_t size_t; #define bzero(s, n) __builtin_bzero(s, n) -void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } +void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } // expected-warning{{function '__builtin_bzero' is unsafe}} From f574b9c9297538a8d471658564619be3ad6e87dd Mon Sep 17 00:00:00 2001 From: Edd Dawson Date: Wed, 4 Sep 2024 20:36:24 +0100 Subject: [PATCH 146/425] [PS4,PS5][Driver] Check for absent SDK when -nostdlib/-nodefaultlibs (#107112) The PlayStation drivers emit warnings if it looks like SDK libraries are missing. Until this point, the check was skipped when either `-nostdlib` or `-nodefaultlibs` was supplied. I believe the idea is that if you aren't linking default libraries, you won't be in need of the SDK. However, in a situation where these switches are supplied, users may still want to pass `-lSomeSDKLib` to the driver/linker with the expectation that libSomeSDKLib.a will be sourced from the SDK. That is, `-nodefaultlibs` and `-nostdlib` affect the libraries passed to the linker, but not the library search paths. So this change removes `-nostdlib`/`-nodefaultlibs` from consideration when deciding whether or not to probe for the SDK's existence. N.B. complete behaviour for `-nostdlib` and `-nodefaultlibs` is yet to be added to the PlayStation compiler drivers. Coming soon. 
SIE tracker: TOOLCHAIN-16704 --- clang/lib/Driver/ToolChains/PS4CPU.cpp | 4 +--- clang/test/Driver/ps4-sdk-root.c | 8 ++------ clang/test/Driver/ps5-sdk-root.c | 10 +++------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 22103eb50803a..54ec59e6398f8 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -354,9 +354,7 @@ toolchains::PS4PS5Base::PS4PS5Base(const Driver &D, const llvm::Triple &Triple, SmallString<512> SDKLibDir(SDKRootDir); llvm::sys::path::append(SDKLibDir, "target/lib"); - if (!Args.hasArg(options::OPT_nostdlib) && - !Args.hasArg(options::OPT_nodefaultlibs) && - !Args.hasArg(options::OPT__sysroot_EQ) && !Args.hasArg(options::OPT_E) && + if (!Args.hasArg(options::OPT__sysroot_EQ) && !Args.hasArg(options::OPT_E) && !Args.hasArg(options::OPT_c) && !Args.hasArg(options::OPT_S) && !Args.hasArg(options::OPT_emit_ast) && !llvm::sys::fs::exists(SDKLibDir)) { diff --git a/clang/test/Driver/ps4-sdk-root.c b/clang/test/Driver/ps4-sdk-root.c index e1a04522030c1..3e02fa9fc3bc2 100644 --- a/clang/test/Driver/ps4-sdk-root.c +++ b/clang/test/Driver/ps4-sdk-root.c @@ -6,9 +6,8 @@ // Check that PS4 clang doesn't report a warning message when locating // system libraries (either by looking at the value of SCE_ORBIS_SDK_DIR -// or relative to the location of the compiler driver), if "-c", "-S", "-E", -// "--sysroot", "-nostdlib" or "-nodefaultlibs" option is specified on -// the command line. +// or relative to the location of the compiler driver), if "-c", "-S", "-E" +// or "--sysroot" option is specified on the command line. // Otherwise, check that PS4 clang reports a warning. // Setting up SCE_ORBIS_SDK_DIR to existing location, which is not a PS4 SDK. @@ -36,9 +35,6 @@ // RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s // RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### --sysroot=foo/ -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s -// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nostdlib -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s -// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nodefaultlibs -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s - // NO-WARN-NOT: {{warning:|error:}} // WARN-SYS-HEADERS: warning: unable to find PS4 system headers directory // WARN-ISYSROOT: warning: no such sysroot directory: 'foo' diff --git a/clang/test/Driver/ps5-sdk-root.c b/clang/test/Driver/ps5-sdk-root.c index c3672aef9dc0c..2a82d8e72283b 100644 --- a/clang/test/Driver/ps5-sdk-root.c +++ b/clang/test/Driver/ps5-sdk-root.c @@ -8,12 +8,11 @@ // Check that PS5 clang doesn't report a warning message when locating // system libraries (either by looking at the value of SCE_PROSPERO_SDK_DIR -// or relative to the location of the compiler driver), if "-c", "-S", "-E", -// "--sysroot", "-nostdlib" or "-nodefaultlibs" option is specified on -// the command line. +// or relative to the location of the compiler driver), if "-c", "-S", "-E" +// or "--sysroot" option is specified on the command line. // Otherwise, check that PS5 clang reports a warning. 
-// Setting up SCE_PROSPERO_SDK_DIR to existing location, which is not a PS4 SDK.
+// Setting up SCE_PROSPERO_SDK_DIR to existing location, which is not a PS5 SDK.
 // RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=WARN-SYS-LIBS -check-prefix=NO-WARN %s
 // RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
@@ -38,9 +37,6 @@
 // RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -isysroot foo -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
 // RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### --sysroot=foo/ -isysroot foo -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
 
-// RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nostdlib -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nodefaultlibs -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-
 // NO-WARN-NOT: {{warning:|error:}}
 // WARN-SYS-HEADERS: warning: unable to find PS5 system headers directory
 // WARN-ISYSROOT: warning: no such sysroot directory: 'foo'

From c50fecaaaabcf1598dc25fbde24c8352745b4ac9 Mon Sep 17 00:00:00 2001
From: Ben Howe <141149032+bmhowe23@users.noreply.github.com>
Date: Wed, 4 Sep 2024 14:37:14 -0500
Subject: [PATCH 147/425] [mlir] Fix region simplification bug when later
 blocks use prior block argument values (#97960)

This fixes #94520 by ensuring that if any block arguments are used
outside of the original block, the block is not considered a candidate
for merging.

More details: the root cause of the issue described in #94520 was that
`^bb2` and `^bb5` were being merged despite `%4` (an argument to `^bb2`)
being used later in `^bb7`. When the block merge occurred, that
unintentionally changed the value of `%4` for all downstream code. This
change prevents that from happening.

---
 mlir/lib/Transforms/Utils/RegionUtils.cpp     |  9 ++++++
 .../Transforms/canonicalize-block-merge.mlir  | 28 +++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index 3c7523827699c..b7d26e8823094 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -877,6 +877,15 @@ static LogicalResult mergeIdenticalBlocks(RewriterBase &rewriter,
     if (hasNonEmptyRegion)
       continue;
 
+    // Don't allow merging if this block's arguments are used outside of the
+    // original block.
+    bool argHasExternalUsers = llvm::any_of(
+        block->getArguments(), [block](mlir::BlockArgument &arg) {
+          return arg.isUsedOutsideOfBlock(block);
+        });
+    if (argHasExternalUsers)
+      continue;
+
     // Try to add this block to an existing cluster.
bool addedToCluster = false; for (auto &cluster : clusters) diff --git a/mlir/test/Transforms/canonicalize-block-merge.mlir b/mlir/test/Transforms/canonicalize-block-merge.mlir index 92cfde817cf7f..6dbfcb562e588 100644 --- a/mlir/test/Transforms/canonicalize-block-merge.mlir +++ b/mlir/test/Transforms/canonicalize-block-merge.mlir @@ -290,3 +290,31 @@ func.func @dead_dealloc_fold_multi_use(%cond : i1) { memref.dealloc %a: memref<4xf32> return } + +// CHECK-LABEL: func @nested_loop +func.func @nested_loop(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i1) { +// Irreducible control-flow: enter the middle of the loop in LoopBody_entry here. + "test.foo_br"(%arg0, %arg4)[^LoopBody_entry] : (i32, i32) -> () + +// Loop exit condition: jump to exit or LoobBody blocks +^Loop_header: // 2 preds: ^bb2, ^bb3 + // Consumes the block arg from LoopBody_entry + // Because of this use here, we can't merge the two blocks below. + "test.foo_br2"(%0)[^EXIT, ^LoopBody_entry, ^LoopBody_other] : (i32) -> () + +// LoopBody_entry is jumped in from the entry block (bb0) and Loop_header +// It **dominates** the Loop_header. +^LoopBody_entry(%0: i32): // 2 preds: ^bb0, ^Loop_header + // CHECK: test.bar + %1 = "test.bar"(%0) : (i32) -> i32 + cf.br ^Loop_header + +// Other block inside the loop, not dominating the header +^LoopBody_other(%2: i32): // pred: ^Loop_header + // CHECK: test.bar + %3 = "test.bar"(%2) : (i32) -> i32 + cf.br ^Loop_header + +^EXIT: // pred: ^Loop_header + return +} From 34f2c9a9ce73a61b27d75dab7e1eed256491afcc Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 4 Sep 2024 20:44:13 +0100 Subject: [PATCH 148/425] [AArch64] Add tests for FP conversion with 3 element vectors. Add tests showing a number of cases where costs for floating point conversions are overestimated for vectors with 3 elements. 
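
A hedged illustration (not part of this patch, and the names below are made
up): one plausible source pattern that lowers to the <3 x float> conversions
exercised by these tests, where an overestimated conversion cost can steer
the vectorizers away from profitable 3-wide code.

    // Assumed example only: converting a 3-channel pixel produces
    // fptoui on a 3-element float vector once the fields are vectorized.
    struct Pixel { float R, G, B; };
    void quantize(const Pixel &In, unsigned char Out[3]) {
      Out[0] = static_cast<unsigned char>(In.R); // float -> i8 (fptoui)
      Out[1] = static_cast<unsigned char>(In.G);
      Out[2] = static_cast<unsigned char>(In.B);
    }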
--- .../CostModel/AArch64/vec3-fp-conversions.ll | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 llvm/test/Analysis/CostModel/AArch64/vec3-fp-conversions.ll diff --git a/llvm/test/Analysis/CostModel/AArch64/vec3-fp-conversions.ll b/llvm/test/Analysis/CostModel/AArch64/vec3-fp-conversions.ll new file mode 100644 index 0000000000000..334c6107eb383 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/vec3-fp-conversions.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=arm64-apple-macosx < %s | FileCheck %s + +define <3 x i32> @fptoui_v3f32_to_v3i32(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptoui_v3f32_to_v3i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = fptoui <3 x float> %in to <3 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i32> %conv +; + %conv = fptoui <3 x float> %in to <3 x i32> + ret <3 x i32> %conv +} + +define <3 x i16> @fptoui_v3f32_to_v3i16(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptoui_v3f32_to_v3i16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %conv = fptoui <3 x float> %in to <3 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i16> %conv +; + %conv = fptoui <3 x float> %in to <3 x i16> + ret <3 x i16> %conv +} + +define <3 x i8> @fptoui_v3f32_to_v3i8(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptoui_v3f32_to_v3i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %conv = fptoui <3 x float> %in to <3 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i8> %conv +; + %conv = fptoui <3 x float> %in to <3 x i8> + ret <3 x i8> %conv +} + +define <3 x i32> @fptosi_v3f32_to_v3i32(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptosi_v3f32_to_v3i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = fptosi <3 x float> %in to <3 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i32> %conv +; + %conv = fptosi <3 x float> %in to <3 x i32> + ret <3 x i32> %conv +} + +define <3 x i16> @fptosi_v3f32_to_v3i16(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptosi_v3f32_to_v3i16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %conv = fptosi <3 x float> %in to <3 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i16> %conv +; + %conv = fptosi <3 x float> %in to <3 x i16> + ret <3 x i16> %conv +} + +define <3 x i8> @fptosi_v3f32_to_v3i8(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptosi_v3f32_to_v3i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %conv = fptosi <3 x float> %in to <3 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i8> %conv +; + %conv = fptosi <3 x float> %in to <3 x i8> + ret <3 x i8> %conv +} + +define <3 x float> @uitofp_v3i32_to_v3f32(<3 x i32> %in, ptr %dst) { +; CHECK-LABEL: 'uitofp_v3i32_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = uitofp <3 x i32> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = uitofp <3 x i32> %in to <3 x float> + ret <3 x float> %conv +} + +define <3 x float> @uitofp_v3i16_to_v3f32(<3 x i16> %in, ptr %dst) { +; CHECK-LABEL: 
'uitofp_v3i16_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = uitofp <3 x i16> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = uitofp <3 x i16> %in to <3 x float> + ret <3 x float> %conv +} + +define <3 x float> @uitofp_v3i8_to_v3f32(<3 x i8> %in, ptr %dst) { +; CHECK-LABEL: 'uitofp_v3i8_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = uitofp <3 x i8> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = uitofp <3 x i8> %in to <3 x float> + ret <3 x float> %conv +} + +define <3 x float> @sitofp_v3i32_to_v3f32(<3 x i32> %in, ptr %dst) { +; CHECK-LABEL: 'sitofp_v3i32_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = sitofp <3 x i32> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = sitofp <3 x i32> %in to <3 x float> + ret <3 x float> %conv +} + +define <3 x float> @sitofp_v3i16_to_v3f32(<3 x i16> %in, ptr %dst) { +; CHECK-LABEL: 'sitofp_v3i16_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = sitofp <3 x i16> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = sitofp <3 x i16> %in to <3 x float> + ret <3 x float> %conv +} + +define <3 x float> @sitofp_v3i8_to_v3f32(<3 x i8> %in, ptr %dst) { +; CHECK-LABEL: 'sitofp_v3i8_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = sitofp <3 x i8> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = sitofp <3 x i8> %in to <3 x float> + ret <3 x float> %conv +} From 3fe6a064f15cd854fd497594cc20e8b680cd2133 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 4 Sep 2024 20:50:05 +0100 Subject: [PATCH 149/425] [LV] Check if compare is truncated directly in getInstructionCost. The current check for truncated compares in getInstructionCost misses cases where either the first or both operands are constants. Check directly if the compare is marked for truncation. In that case, the minimum bitwidth is that of the operands. The patch also adds asserts to ensure that. This fixes a divergence between legacy and VPlan-based cost model, where the legacy cost model incorrectly estimated the cost of compares with truncated operands. Fixes https://github.com/llvm/llvm-project/issues/107171. 
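
A minimal sketch of the idea, paraphrasing the change below rather than
quoting it (the names follow the surrounding LoopVectorize code):

    // Query the compare itself instead of its first operand, which may be a
    // Constant rather than an Instruction (e.g. `icmp eq i8 0, 0` from the
    // new test case) and therefore never appears in MinBWs.
    if (canTruncateToMinimalBitwidth(I, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);

Previously the lookup went through dyn_cast<Instruction>(I->getOperand(0)),
so a compare whose operands are constants was costed at its original width
even when the compare itself had been marked for narrowing to i1.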
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 15 +++++-
 .../truncate-to-minimal-bitwidth-cost.ll      | 51 +++++++++++++++++++
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0200525a718d5..0ccf442dac999 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6606,9 +6606,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::ICmp:
   case Instruction::FCmp: {
     Type *ValTy = I->getOperand(0)->getType();
+
     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
-    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
-      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
+    (void)Op0AsInstruction;
+    assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
+            canTruncateToMinimalBitwidth(I, VF)) &&
+           "truncating Op0 must imply truncating the compare");
+    if (canTruncateToMinimalBitwidth(I, VF)) {
+      assert(!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
+             MinBWs[I] == MinBWs[Op0AsInstruction] &&
+                 "if both the operand and the compare are marked for "
+                 "truncation, they must have the same bitwidth");
+      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
+    }
+
     VectorTy = ToVectorTy(ValTy, VF);
     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
                                   cast<CmpInst>(I)->getPredicate(), CostKind,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index 3e2f290a497db..9fe5a2a6a3ecc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -221,7 +221,56 @@ exit:
   ret void
 }
 
+; Test case for https://github.com/llvm/llvm-project/issues/107171.
+define i8 @icmp_ops_narrowed_to_i1() #1 { +; CHECK-LABEL: define i8 @icmp_ops_narrowed_to_i1( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96 +; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 0, 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[C]] to i64 +; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[EXT]], 1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i8 +; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 100 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[TRUNC_LCSSA:%.*]] = phi i8 [ [[TRUNC]], %[[LOOP]] ], [ 0, %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i8 [[TRUNC_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ] + %c = icmp eq i8 0, 0 + %ext = zext i1 %c to i64 + %shr = lshr i64 %ext, 1 + %trunc = trunc i64 %shr to i8 + %iv.next = add i16 %iv, 1 + %ec = icmp eq i16 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret i8 %trunc +} + attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } +attributes #1 = { "target-features"="+64bit,+v" } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} @@ -232,4 +281,6 @@ attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} ;. 
From 42b4092db99633ec53d136d5da7abfcfb236c14e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 4 Sep 2024 12:57:08 -0700 Subject: [PATCH 150/425] [RISCV] Precommit vmv.v.v with undef passthru tests --- llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll | 14 ++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll index 3952e48c5c28f..c1602c912da63 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll @@ -194,3 +194,17 @@ define @unfoldable_mismatched_sew( %passthr %b = call @llvm.riscv.vmv.v.v.nxv2i32( %passthru, %a.bitcast, iXLen %avl) ret %b } + + +define @undef_passthru( %passthru, %x, %y, iXLen %avl) { +; CHECK-LABEL: undef_passthru: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma +; CHECK-NEXT: vadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: ret + %a = call @llvm.riscv.vadd.nxv1i64.nxv1i64( %passthru, %x, %y, iXLen %avl) + %b = call @llvm.riscv.vmv.v.v.nxv1i64( undef, %a, iXLen %avl) + ret %b +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir index 771b2073370e6..7e2ac0e26f251 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir @@ -60,3 +60,17 @@ body: | %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 0 /* tu, mu */ ... +--- +name: undef_passthru +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: undef_passthru + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 1 /* ta, mu */ + %passthru:vr = COPY $v8 + %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 1 /* ta, mu */ From d21e731c42d6b967e29dbe2edc16c1b86885df0d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 12:59:53 -0700 Subject: [PATCH 151/425] [RISCV] Fix typos in comment. NFC --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index bc661c72e5ecc..2ea909b085a6d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19631,7 +19631,7 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, } // FastCC has less than 1% performance improvement for some particular -// benchmark. But theoretically, it may has benenfit for some cases. +// benchmark. But theoretically, it may have benefit for some cases. 
bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, From 23f6c3370b8bc0bf773e69a41bf90454c0a10120 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 16:39:09 -0400 Subject: [PATCH 152/425] [libc++][modules] Remove dependency on __algorithm/max from hypot.h (#107150) That dependency was added recently when we made improvements to std::hypot, but that resulted in `__math` depending on `__algorithm`, which is a very heavyweight module. This patch uses `__math::fmax` instead. --- libcxx/include/__math/hypot.h | 4 ++-- libcxx/test/libcxx/transitive_includes/cxx03.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx11.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx14.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx17.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx20.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx23.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx26.csv | 2 -- 8 files changed, 2 insertions(+), 16 deletions(-) diff --git a/libcxx/include/__math/hypot.h b/libcxx/include/__math/hypot.h index 2c2c9c38ab530..b2bf8e11c8ec2 100644 --- a/libcxx/include/__math/hypot.h +++ b/libcxx/include/__math/hypot.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___MATH_HYPOT_H #define _LIBCPP___MATH_HYPOT_H -#include <__algorithm/max.h> #include <__config> #include <__math/abs.h> #include <__math/exponential_functions.h> +#include <__math/min_max.h> #include <__math/roots.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_arithmetic.h> @@ -62,7 +62,7 @@ _LIBCPP_HIDE_FROM_ABI _Real __hypot(_Real __x, _Real __y, _Real __z) { const _Real __overflow_scale = __math::ldexp(_Real(1), -(__exp + 20)); // Scale arguments depending on their size - const _Real __max_abs = std::max(__math::fabs(__x), std::max(__math::fabs(__y), __math::fabs(__z))); + const _Real __max_abs = __math::fmax(__math::fabs(__x), __math::fmax(__math::fabs(__y), __math::fabs(__z))); _Real __scale; if (__max_abs > __overflow_threshold) { // x*x + y*y + z*z might overflow __scale = __overflow_scale; diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index fd47c65ffc307..3bf39ea17c912 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -131,8 +131,6 @@ chrono type_traits chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath type_traits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index 347d5d8796687..49125486cfcf6 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -131,8 +131,6 @@ chrono type_traits chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath type_traits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index 57bb2f25b3d62..28dfb320fe06c 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -132,8 +132,6 @@ chrono type_traits chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath type_traits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index 
6826aeb75a83b..5b7b6cecf73f8 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -132,8 +132,6 @@ chrono type_traits chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath type_traits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index 17169e385d544..84ea6433fb12d 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -137,8 +137,6 @@ chrono type_traits chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath type_traits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 267dca3cc6c41..946ba486294d3 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -84,8 +84,6 @@ chrono string_view chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath version codecvt cctype diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 267dca3cc6c41..946ba486294d3 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -84,8 +84,6 @@ chrono string_view chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath version codecvt cctype From 5e19fd172063c8957a35c7fa3596620f79ebba97 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 16:39:55 -0400 Subject: [PATCH 153/425] [libc++][modules] Consolidate leaf modules into their own top-level modules (#107147) Some modules are leaf modules in the sense that they are not used by any other part of the headers. These leaf modules are easy to consolidate since there is no risk to create a cycle. As a result of regrouping these modules, several missing includes were found and fixed in this patch. 
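
A hedged sketch of the kind of missing include this regrouping turns up (the
header and include names come from the diff below; the reduced usage shown is
assumed, not copied from the header):

    // __mdspan/layout_stride.h previously picked up common_type through
    // another header in the same monolithic module; once the modules are
    // regrouped it has to state the dependency itself.
    #include <__type_traits/common_type.h>
    using __index_type = common_type_t<size_t, ptrdiff_t>; // assumed use

In the modulemap diff below, each detail header then becomes a submodule of
the public module it belongs to, as in the std_charconv and std_mdspan
entries.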
--- libcxx/include/__filesystem/directory_entry.h | 1 + libcxx/include/__filesystem/path.h | 1 + libcxx/include/__mdspan/layout_left.h | 1 + libcxx/include/__mdspan/layout_right.h | 1 + libcxx/include/__mdspan/layout_stride.h | 4 + libcxx/include/module.modulemap | 117 +++++++----------- .../views/mdspan/CustomTestLayouts.h | 1 + .../views/mdspan/extents/comparison.pass.cpp | 5 +- .../views/mdspan/extents/conversion.pass.cpp | 5 +- .../views/mdspan/extents/ctad.pass.cpp | 1 + .../views/mdspan/extents/dextents.pass.cpp | 1 + .../views/mdspan/extents/dims.pass.cpp | 1 + .../mdspan/extents/index_type.verify.cpp | 1 + .../views/mdspan/extents/obs_static.pass.cpp | 1 + .../views/mdspan/extents/types.pass.cpp | 5 +- .../mdspan/layout_left/comparison.pass.cpp | 5 +- .../mdspan/layout_left/ctor.default.pass.cpp | 1 + .../mdspan/layout_left/ctor.extents.pass.cpp | 1 + .../layout_left/ctor.layout_right.pass.cpp | 3 +- .../layout_left/ctor.layout_stride.pass.cpp | 4 +- .../mdspan/layout_left/ctor.mapping.pass.cpp | 3 +- .../layout_left/index_operator.pass.cpp | 2 + .../mdspan/layout_left/properties.pass.cpp | 5 +- .../layout_left/required_span_size.pass.cpp | 2 +- .../layout_left/static_requirements.pass.cpp | 6 +- .../views/mdspan/layout_left/stride.pass.cpp | 4 +- .../mdspan/layout_right/comparison.pass.cpp | 5 +- .../mdspan/layout_right/ctor.default.pass.cpp | 1 + .../mdspan/layout_right/ctor.extents.pass.cpp | 1 + .../layout_right/ctor.layout_left.pass.cpp | 3 +- .../layout_right/ctor.layout_stride.pass.cpp | 4 +- .../mdspan/layout_right/ctor.mapping.pass.cpp | 3 +- .../layout_right/index_operator.pass.cpp | 2 + .../mdspan/layout_right/properties.pass.cpp | 5 +- .../layout_right/required_span_size.pass.cpp | 1 + .../layout_right/static_requirements.pass.cpp | 6 +- .../views/mdspan/layout_right/stride.pass.cpp | 4 +- .../mdspan/layout_stride/comparison.pass.cpp | 5 +- .../layout_stride/ctor.default.pass.cpp | 1 + .../layout_stride/ctor.extents_array.pass.cpp | 3 + .../layout_stride/ctor.extents_span.pass.cpp | 3 + .../ctor.strided_mapping.pass.cpp | 3 +- .../mdspan/layout_stride/deduction.pass.cpp | 11 +- .../layout_stride/index_operator.pass.cpp | 3 + .../is_exhaustive_corner_case.pass.cpp | 6 +- .../mdspan/layout_stride/properties.pass.cpp | 6 +- .../layout_stride/required_span_size.pass.cpp | 2 + .../static_requirements.pass.cpp | 6 +- .../mdspan/layout_stride/stride.pass.cpp | 4 +- .../views/mdspan/mdspan/assign.pass.cpp | 5 +- .../views/mdspan/mdspan/conversion.pass.cpp | 5 +- .../views/mdspan/mdspan/ctor.copy.pass.cpp | 5 +- .../views/mdspan/mdspan/ctor.default.pass.cpp | 5 +- .../mdspan/mdspan/ctor.dh_array.pass.cpp | 3 +- .../mdspan/mdspan/ctor.dh_extents.pass.cpp | 5 +- .../mdspan/mdspan/ctor.dh_integers.pass.cpp | 3 +- .../views/mdspan/mdspan/ctor.dh_map.pass.cpp | 5 +- .../mdspan/mdspan/ctor.dh_map_acc.pass.cpp | 5 +- .../views/mdspan/mdspan/ctor.dh_span.pass.cpp | 3 +- .../views/mdspan/mdspan/ctor.move.pass.cpp | 5 +- .../views/mdspan/mdspan/deduction.pass.cpp | 5 +- .../mdspan/mdspan/index_operator.pass.cpp | 1 + .../views/mdspan/mdspan/move.pass.cpp | 5 +- .../views/mdspan/mdspan/properties.pass.cpp | 5 +- .../views/mdspan/mdspan/swap.pass.cpp | 5 +- .../views/mdspan/mdspan/types.pass.cpp | 5 +- .../test_offset_time_zone.h | 1 + .../string_view.pass.cpp | 2 + .../string_view_local_time.pass.cpp | 2 + .../string_view_local_time_choose.pass.cpp | 2 +- .../string_view_sys_time.pass.cpp | 2 + ...ned_time_duration2_time_zone_ptr2.pass.cpp | 2 +- 
...e_duration2_time_zone_ptr2_choose.pass.cpp | 2 +- .../sys_time.pass.cpp | 1 + .../time_zone_pointer.pass.cpp | 1 + .../time_zone_pointer_local_time.pass.cpp | 2 +- ...me_zone_pointer_local_time_choose.pass.cpp | 2 +- .../time_zone_pointer_sys_time.pass.cpp | 2 +- .../get_local_time.pass.cpp | 1 + .../get_sys_time.pass.cpp | 1 + .../get_time_zone.pass.cpp | 1 + .../operator_local_time.pass.cpp | 1 + .../operator_sys_time.pass.cpp | 1 + .../charconv.from.chars/integral.pass.cpp | 2 + .../integral.roundtrip.pass.cpp | 2 + .../from_chars_result.operator_bool.pass.cpp | 1 + .../charconv.syn/from_chars_result.pass.cpp | 1 + .../to_chars_result.operator_bool.pass.cpp | 1 + .../charconv.syn/to_chars_result.pass.cpp | 1 + .../charconv.to.chars/integral.pass.cpp | 1 + 90 files changed, 234 insertions(+), 146 deletions(-) diff --git a/libcxx/include/__filesystem/directory_entry.h b/libcxx/include/__filesystem/directory_entry.h index 96d88dcd90b4c..2c638e7ee354b 100644 --- a/libcxx/include/__filesystem/directory_entry.h +++ b/libcxx/include/__filesystem/directory_entry.h @@ -20,6 +20,7 @@ #include <__filesystem/operations.h> #include <__filesystem/path.h> #include <__filesystem/perms.h> +#include <__fwd/ostream.h> #include <__system_error/errc.h> #include <__system_error/error_code.h> #include <__utility/move.h> diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h index ff468d517722f..eef1fc0db3ea7 100644 --- a/libcxx/include/__filesystem/path.h +++ b/libcxx/include/__filesystem/path.h @@ -21,6 +21,7 @@ #include <__type_traits/is_pointer.h> #include <__type_traits/remove_const.h> #include <__type_traits/remove_pointer.h> +#include <__utility/move.h> #include #include #include diff --git a/libcxx/include/__mdspan/layout_left.h b/libcxx/include/__mdspan/layout_left.h index d058cbccffd96..59574e83b0d7b 100644 --- a/libcxx/include/__mdspan/layout_left.h +++ b/libcxx/include/__mdspan/layout_left.h @@ -21,6 +21,7 @@ #include <__config> #include <__fwd/mdspan.h> #include <__mdspan/extents.h> +#include <__type_traits/common_type.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_nothrow_constructible.h> diff --git a/libcxx/include/__mdspan/layout_right.h b/libcxx/include/__mdspan/layout_right.h index 6842e9dc37fdc..d1acdb41238f7 100644 --- a/libcxx/include/__mdspan/layout_right.h +++ b/libcxx/include/__mdspan/layout_right.h @@ -21,6 +21,7 @@ #include <__config> #include <__fwd/mdspan.h> #include <__mdspan/extents.h> +#include <__type_traits/common_type.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_nothrow_constructible.h> diff --git a/libcxx/include/__mdspan/layout_stride.h b/libcxx/include/__mdspan/layout_stride.h index 86148ac849eca..704a5a4c1aea5 100644 --- a/libcxx/include/__mdspan/layout_stride.h +++ b/libcxx/include/__mdspan/layout_stride.h @@ -18,12 +18,15 @@ #define _LIBCPP___MDSPAN_LAYOUT_STRIDE_H #include <__assert> +#include <__concepts/same_as.h> #include <__config> #include <__fwd/mdspan.h> #include <__mdspan/extents.h> +#include <__type_traits/common_type.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_nothrow_constructible.h> +#include <__type_traits/is_same.h> #include <__utility/as_const.h> #include <__utility/integer_sequence.h> #include <__utility/swap.h> @@ -31,6 +34,7 @@ #include #include #include +#include #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 297d155cb5594..7cde21417561e 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -29,6 +29,16 @@ module std_bitset [system] { } module std_charconv [system] { header "charconv" + module chars_format { header "__charconv/chars_format.h" } + module from_chars_integral { header "__charconv/from_chars_integral.h" } + module from_chars_result { header "__charconv/from_chars_result.h" } + module tables { header "__charconv/tables.h" } + module to_chars { header "__charconv/to_chars.h" } + module to_chars_base_10 { header "__charconv/to_chars_base_10.h" } + module to_chars_floating_point { header "__charconv/to_chars_floating_point.h" } + module to_chars_integral { header "__charconv/to_chars_integral.h" } + module to_chars_result { header "__charconv/to_chars_result.h" } + module traits { header "__charconv/traits.h" } export * } module std_chrono [system] { @@ -53,10 +63,15 @@ module std_concepts [system] { } module std_condition_variable [system] { header "condition_variable" + module condition_variable { header "__condition_variable/condition_variable.h" } export * } module std_coroutine [system] { header "coroutine" + module coroutine_handle { header "__coroutine/coroutine_handle.h" } + module coroutine_traits { header "__coroutine/coroutine_traits.h" } + module noop_coroutine_handle { header "__coroutine/noop_coroutine_handle.h" } + module trivial_awaitables { header "__coroutine/trivial_awaitables.h" } export * } module std_deque [system] { @@ -77,6 +92,28 @@ module std_expected [system] { } module std_filesystem [system] { header "filesystem" + module copy_options { header "__filesystem/copy_options.h" } + module directory_entry { header "__filesystem/directory_entry.h" } + module directory_iterator { header "__filesystem/directory_iterator.h" } + module directory_options { header "__filesystem/directory_options.h" } + module file_status { header "__filesystem/file_status.h" } + module file_time_type { header "__filesystem/file_time_type.h" } + module file_type { header "__filesystem/file_type.h" } + module filesystem_error { + header "__filesystem/filesystem_error.h" + export std_private_memory_shared_ptr + } + module operations { header "__filesystem/operations.h" } + module path { + header "__filesystem/path.h" + export std_string // returned by various methods + } + module path_iterator { header "__filesystem/path_iterator.h" } + module perm_options { header "__filesystem/perm_options.h" } + module perms { header "__filesystem/perms.h" } + module recursive_directory_iterator { header "__filesystem/recursive_directory_iterator.h" } + module space_info { header "__filesystem/space_info.h" } + module u8path { header "__filesystem/u8path.h" } export * } module std_format [system] { @@ -149,6 +186,16 @@ module std_map [system] { } module std_mdspan [system] { header "mdspan" + module default_accessor { header "__mdspan/default_accessor.h" } + module extents { header "__mdspan/extents.h" } + module fwd { header "__fwd/mdspan.h" } + module layout_left { header "__mdspan/layout_left.h" } + module layout_right { header "__mdspan/layout_right.h" } + module layout_stride { header "__mdspan/layout_stride.h" } + module mdspan { + header "__mdspan/mdspan.h" + export std_array // for strides() + } export * } module std_memory [system] { @@ -1082,23 +1129,6 @@ module std_private_bit_invert_if [system] { header 
"__bit/invert_if.h" } module std_private_bit_popcount [system] { header "__bit/popcount.h" } module std_private_bit_rotate [system] { header "__bit/rotate.h" } -module std_private_charconv_chars_format [system] { header "__charconv/chars_format.h" } -module std_private_charconv_from_chars_integral [system] { header "__charconv/from_chars_integral.h" } -module std_private_charconv_from_chars_result [system] { header "__charconv/from_chars_result.h" } -module std_private_charconv_tables [system] { header "__charconv/tables.h" } -module std_private_charconv_to_chars [system] { header "__charconv/to_chars.h" } -module std_private_charconv_to_chars_base_10 [system] { header "__charconv/to_chars_base_10.h" } -module std_private_charconv_to_chars_floating_point [system] { header "__charconv/to_chars_floating_point.h" } -module std_private_charconv_to_chars_integral [system] { - header "__charconv/to_chars_integral.h" - export std_private_charconv_traits -} -module std_private_charconv_to_chars_result [system] { - header "__charconv/to_chars_result.h" - export * -} -module std_private_charconv_traits [system] { header "__charconv/traits.h" } - module std_private_chrono_calendar [system] { header "__chrono/calendar.h" } module std_private_chrono_concepts [system] { header "__chrono/concepts.h" } module std_private_chrono_convert_to_timespec [system] { header "__chrono/convert_to_timespec.h" } @@ -1223,16 +1253,6 @@ module std_private_concepts_semiregular [system] { header "__concepts/ module std_private_concepts_swappable [system] { header "__concepts/swappable.h" } module std_private_concepts_totally_ordered [system] { header "__concepts/totally_ordered.h" } -module std_private_condition_variable_condition_variable [system] { - header "__condition_variable/condition_variable.h" - export * -} - -module std_private_coroutine_coroutine_handle [system] { header "__coroutine/coroutine_handle.h" } -module std_private_coroutine_coroutine_traits [system] { header "__coroutine/coroutine_traits.h" } -module std_private_coroutine_noop_coroutine_handle [system] { header "__coroutine/noop_coroutine_handle.h" } -module std_private_coroutine_trivial_awaitables [system] { header "__coroutine/trivial_awaitables.h" } - module std_private_debug_utils_randomize_range [system] { header "__debug_utils/randomize_range.h" } module std_private_debug_utils_sanitizers [system] { header "__debug_utils/sanitizers.h" } module std_private_debug_utils_strict_weak_ordering_check [system] { @@ -1256,38 +1276,6 @@ module std_private_expected_expected [system] { header "__expected/ex module std_private_expected_unexpect [system] { header "__expected/unexpect.h" } module std_private_expected_unexpected [system] { header "__expected/unexpected.h" } -module std_private_filesystem_copy_options [system] { header "__filesystem/copy_options.h" } -module std_private_filesystem_directory_entry [system] { - header "__filesystem/directory_entry.h" - export * -} -module std_private_filesystem_directory_iterator [system] { - header "__filesystem/directory_iterator.h" - export * -} -module std_private_filesystem_directory_options [system] { header "__filesystem/directory_options.h" } -module std_private_filesystem_file_status [system] { header "__filesystem/file_status.h" } -module std_private_filesystem_file_time_type [system] { header "__filesystem/file_time_type.h" } -module std_private_filesystem_file_type [system] { header "__filesystem/file_type.h" } -module std_private_filesystem_filesystem_error [system] { - header 
"__filesystem/filesystem_error.h" - export * -} -module std_private_filesystem_operations [system] { header "__filesystem/operations.h" } -module std_private_filesystem_path [system] { - header "__filesystem/path.h" - export * -} -module std_private_filesystem_path_iterator [system] { header "__filesystem/path_iterator.h" } -module std_private_filesystem_perm_options [system] { header "__filesystem/perm_options.h" } -module std_private_filesystem_perms [system] { header "__filesystem/perms.h" } -module std_private_filesystem_recursive_directory_iterator [system] { - header "__filesystem/recursive_directory_iterator.h" - export * -} -module std_private_filesystem_space_info [system] { header "__filesystem/space_info.h" } -module std_private_filesystem_u8path [system] { header "__filesystem/u8path.h" } - module std_private_format_buffer [system] { header "__format/buffer.h" } module std_private_format_concepts [system] { header "__format/concepts.h" } module std_private_format_container_adaptor [system] { header "__format/container_adaptor.h" } @@ -1496,17 +1484,6 @@ module std_private_math_special_functions [system] { header "__mat module std_private_math_traits [system] { header "__math/traits.h" } module std_private_math_trigonometric_functions [system] { header "__math/trigonometric_functions.h" } -module std_private_mdspan_default_accessor [system] { header "__mdspan/default_accessor.h" } -module std_private_mdspan_extents [system] { - header "__mdspan/extents.h" - export * -} -module std_private_mdspan_layout_left [system] { header "__mdspan/layout_left.h" } -module std_private_mdspan_layout_right [system] { header "__mdspan/layout_right.h" } -module std_private_mdspan_layout_stride [system] { header "__mdspan/layout_stride.h" } -module std_private_mdspan_mdspan [system] { header "__mdspan/mdspan.h" } -module std_private_mdspan_mdspan_fwd [system] { header "__fwd/mdspan.h" } - module std_private_memory_addressof [system] { header "__memory/addressof.h" } module std_private_memory_align [system] { header "__memory/align.h" } module std_private_memory_aligned_alloc [system] { header "__memory/aligned_alloc.h" } diff --git a/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h b/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h index 588a5e9774a55..7cd42139758e3 100644 --- a/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h +++ b/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h @@ -25,6 +25,7 @@ #include #include #include +#include // dynamic_extent #include #include diff --git a/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp index 77fbd46fb7ca7..574290ebec854 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp @@ -18,9 +18,10 @@ // #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp index 6b0ecff02baab..f6834b0b4133e 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp @@ -29,10 +29,11 @@ // (numeric_limits::max() < numeric_limits::max()) #include -#include -#include #include +#include #include +#include // dynamic_extent +#include #include 
"test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp index 9144bb6812e3c..1a6501b391396 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp @@ -18,6 +18,7 @@ #include #include +#include // dynamic_extent #include #include "../ConvertibleToIntegral.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/dextents.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/dextents.pass.cpp index a9fc8f3bed074..2adfa49d3bc47 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/dextents.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/dextents.pass.cpp @@ -18,6 +18,7 @@ #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp index e74bc0e66fca1..0476c11efdb64 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp @@ -18,6 +18,7 @@ #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/index_type.verify.cpp b/libcxx/test/std/containers/views/mdspan/extents/index_type.verify.cpp index ba6941a1ab4c1..cdc0464251419 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/index_type.verify.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/index_type.verify.cpp @@ -19,6 +19,7 @@ #include #include #include +#include // dynamic_extent void invalid_index_types() { // expected-error@*:* {{static assertion failed: extents::index_type must be a signed or unsigned integer type}} diff --git a/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp index 0c8d3415a6726..29dd9e2d27072 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp @@ -28,6 +28,7 @@ #include #include +#include // dynamic_extent #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp index dbaff46e82b6b..2924da91f77ee 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp @@ -23,9 +23,10 @@ // } #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp index ab85ccf863963..c8b4083291a68 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp @@ -16,9 +16,10 @@ // Constraints: extents_type::rank() == OtherExtents::rank() is true. 
#include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp index ca478d047549e..5a4040317d243 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp @@ -17,6 +17,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp index 5147539deed26..46505cb961bbd 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp @@ -20,6 +20,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp index 1b5a985c1eb33..5f9bd4344d0ec 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp @@ -21,9 +21,10 @@ // Preconditions: other.required_span_size() is representable as a value of type index_type #include -#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp index 40ecef865477b..34489b7c52d7d 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp @@ -24,9 +24,11 @@ // Effects: Direct-non-list-initializes extents_ with other.extents(). #include -#include +#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp index ecfbd3fef705d..63b3c50c73175 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp @@ -19,9 +19,10 @@ // Preconditions: other.required_span_size() is representable as a value of type index_type #include -#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp index 1b6cb5ab3fb25..40cd6bc2812e3 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp @@ -24,8 +24,10 @@ // * extents_type::index-cast(i) is a multidimensional index in extents_. 
#include +#include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp index e4ab972cb1093..cca1c65f088f1 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp @@ -27,9 +27,10 @@ // } #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp index d0ee090099cf2..4cb111d29827a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp @@ -14,10 +14,10 @@ // // Returns: extents().fwd-prod-of-extents(extents_type::rank()). - #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp index 23a7c8a09005c..7a6add60efcd1 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp @@ -77,9 +77,11 @@ // Returns: true only if m.is_strided() is true for all possible objects m of type M. #include -#include -#include #include +#include +#include // dynamic_extent +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/stride.pass.cpp index b5fc73b6fd5bf..064c279bcc49f 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/stride.pass.cpp @@ -19,10 +19,12 @@ // Returns: extents().rev-prod-of-extents(i). #include -#include #include +#include #include #include +#include // dynamic_extent + #include "test_macros.h" template diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp index 6ca9041fbf2f8..03c78ca5e91d9 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp @@ -16,9 +16,10 @@ // Constraints: extents_type::rank() == OtherExtents::rank() is true. 
#include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp index 71242faa05a32..f02174416f33c 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp @@ -17,6 +17,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp index d9fa771fcfcd1..9c2c39bc3cb3a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp @@ -20,6 +20,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp index 13f2354c61407..61aba5dae6829 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp @@ -21,9 +21,10 @@ // Preconditions: other.required_span_size() is representable as a value of type index_type #include -#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp index 3b7e793c69a17..3bc7d82f8ed8d 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp @@ -24,9 +24,11 @@ // Effects: Direct-non-list-initializes extents_ with other.extents(). 
#include -#include +#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp index dd71f3b2af5f2..eeea5ab021e97 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp @@ -19,9 +19,10 @@ // Preconditions: other.required_span_size() is representable as a value of type index_type #include -#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp index 879e6713376d6..989078f17d303 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp @@ -26,6 +26,8 @@ #include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp index 94ffb1a4db5b4..120cefd619a3a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp @@ -27,9 +27,10 @@ // } #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp index a2efa4ade6a0b..0128d8c26a83e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp @@ -18,6 +18,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp index c4e3d89cb94f4..2b11d17c6717a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp @@ -77,9 +77,11 @@ // Returns: true only if m.is_strided() is true for all possible objects m of type M. #include -#include -#include #include +#include +#include // dynamic_extent +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/stride.pass.cpp index 9a27859afb959..c04f07847c0be 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/stride.pass.cpp @@ -19,10 +19,12 @@ // Returns: extents().rev-prod-of-extents(i). 
#include -#include #include +#include #include #include +#include // dynamic_extent + #include "test_macros.h" template diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/comparison.pass.cpp index 7b452cc43f97d..7c9b4a4ded34d 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/comparison.pass.cpp @@ -23,9 +23,10 @@ // Returns: true if x.extents() == y.extents() is true, OFFSET(y) == 0 is true, and each of x.stride(r) == y.stride(r) is true for r in the range [0, x.extents().rank()). Otherwise, false. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp index 2fa8866058720..108c4c6fca98e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp @@ -23,6 +23,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp index f9157f8c6eedd..cecfb79ea6867 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp @@ -28,8 +28,11 @@ // direct-non-list-initializes strides_[d] with as_const(s[d]). #include +#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" #include "../ConvertibleToIntegral.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp index 36a87ae7a9e84..d0f26ad23df98 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp @@ -28,8 +28,11 @@ // direct-non-list-initializes strides_[d] with as_const(s[d]). 
#include +#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" #include "../ConvertibleToIntegral.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.strided_mapping.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.strided_mapping.pass.cpp index 2f73ed512fe45..6ba67ea2d0122 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.strided_mapping.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.strided_mapping.pass.cpp @@ -36,9 +36,10 @@ // is-mapping-of)) #include -#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp index 259f61536ee37..45e2a1c10d6ff 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp @@ -11,9 +11,12 @@ // #include -#include -#include +#include #include +#include +#include +#include // dynamic_extent +#include #include "test_macros.h" @@ -33,7 +36,7 @@ constexpr bool test() { ASSERT_SAME_TYPE(decltype(std::layout_stride::mapping(std::extents(), std::array{1})), std::layout_stride::mapping>); ASSERT_SAME_TYPE( - decltype(std::layout_stride::mapping(std::extents(), std::array{3, 100})), + decltype(std::layout_stride::mapping(std::extents(), std::array{3, 100})), std::layout_stride::mapping>); ASSERT_SAME_TYPE(decltype(std::layout_stride::mapping(std::extents(), std::span())), @@ -43,7 +46,7 @@ constexpr bool test() { ASSERT_SAME_TYPE(decltype(std::layout_stride::mapping(std::extents(), std::declval>())), std::layout_stride::mapping>); ASSERT_SAME_TYPE( - decltype(std::layout_stride::mapping(std::extents(), std::declval>())), + decltype(std::layout_stride::mapping(std::extents(), std::declval>())), std::layout_stride::mapping>); return true; } diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/index_operator.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/index_operator.pass.cpp index 30281a8d922d1..5669991b8a13a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/index_operator.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/index_operator.pass.cpp @@ -24,8 +24,11 @@ // * extents_type::index-cast(i) is a multidimensional index in extents_. #include +#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp index a4218f34105ad..589e32f86e39d 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp @@ -20,9 +20,11 @@ // - Otherwise, false. 
#include -#include -#include +#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp index a5f77a6685470..b1eb84b375b6e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp @@ -39,9 +39,11 @@ // - Otherwise, false. #include -#include -#include +#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp index 3b6af9f6c1ff0..870518994a939 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp @@ -20,8 +20,10 @@ // Returns: REQUIRED-SPAN-SIZE(extents(), strides_). #include +#include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp index 8c5bf16b37c1e..a69fb4f287c3e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp @@ -77,9 +77,11 @@ // Returns: true only if m.is_strided() is true for all possible objects m of type M. #include -#include -#include #include +#include +#include // dynamic_extent +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/stride.pass.cpp index 07a5199e5bbb1..2f16b1f6ec9aa 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/stride.pass.cpp @@ -19,10 +19,12 @@ // Returns: extents().rev-prod-of-extents(i). 
#include -#include #include +#include #include #include +#include // dynamic_extent + #include "test_macros.h" template diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/assign.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/assign.pass.cpp index 9e128f5180786..4c59b5b61a5a4 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/assign.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/assign.pass.cpp @@ -12,9 +12,10 @@ // constexpr mdspan& operator=(const mdspan& rhs) = default; #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/conversion.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/conversion.pass.cpp index db22e148b34c5..5fe14c1562f4d 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/conversion.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/conversion.pass.cpp @@ -39,9 +39,10 @@ // || !is_convertible_v #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.copy.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.copy.pass.cpp index 6d88e9ff02f96..9540c37cfc4b9 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.copy.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.copy.pass.cpp @@ -14,9 +14,10 @@ // A specialization of mdspan is a trivially copyable type if its accessor_type, mapping_type, and data_handle_type are trivially copyable types. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.default.pass.cpp index dbb2ad8b41bd4..5b84973bc43ef 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.default.pass.cpp @@ -22,9 +22,10 @@ // Effects: Value-initializes ptr_, map_, and acc_. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_array.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_array.pass.cpp index 0e0c7667da307..9ca4fe240495c 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_array.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_array.pass.cpp @@ -29,9 +29,10 @@ // - value-initializes acc_. #include -#include #include +#include #include +#include // dynamic_extent #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_extents.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_extents.pass.cpp index 40e82db986350..4d9f91f63d54c 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_extents.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_extents.pass.cpp @@ -24,9 +24,10 @@ // - value-initializes acc_. 
#include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_integers.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_integers.pass.cpp index c2e8d26cd87f6..381b2d3a8bc2a 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_integers.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_integers.pass.cpp @@ -30,9 +30,10 @@ // - value-initializes acc_. #include -#include #include +#include #include +#include // dynamic_extent #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map.pass.cpp index fa65848ac69b5..f5e9c1b7e2a4c 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map.pass.cpp @@ -22,9 +22,10 @@ // - value-initializes acc_. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map_acc.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map_acc.pass.cpp index 65d32f3d7a7f6..3239c1d65deb9 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map_acc.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map_acc.pass.cpp @@ -19,9 +19,10 @@ // - direct-non-list-initializes acc_ with a. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_span.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_span.pass.cpp index f4fb5e681d95f..2240dbe0801be 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_span.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_span.pass.cpp @@ -29,9 +29,10 @@ // - value-initializes acc_. #include -#include #include +#include #include +#include // dynamic_extent #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.move.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.move.pass.cpp index c843c6033524c..46ba2b2096c40 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.move.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.move.pass.cpp @@ -14,9 +14,10 @@ // A specialization of mdspan is a trivially copyable type if its accessor_type, mapping_type, and data_handle_type are trivially copyable types. 
#include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/deduction.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/deduction.pass.cpp index 876a3e84d6957..8b4d24d2a398d 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/deduction.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/deduction.pass.cpp @@ -52,9 +52,10 @@ // typename MappingType::layout_type, AccessorType>; #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/index_operator.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/index_operator.pass.cpp index ffb10c007222d..22020b1f64881 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/index_operator.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/index_operator.pass.cpp @@ -32,6 +32,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/move.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/move.pass.cpp index 9e1805431cd76..5ce2d06712bf6 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/move.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/move.pass.cpp @@ -14,9 +14,10 @@ // A specialization of mdspan is a trivially copyable type if its accessor_type, mapping_type, and data_handle_type are trivially copyable types. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/properties.pass.cpp index ba1fef1df6779..6368acd0b0f41 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/properties.pass.cpp @@ -52,9 +52,10 @@ // A specialization of mdspan is a trivially copyable type if its accessor_type, mapping_type, and data_handle_type are trivially copyable types. 
#include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/swap.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/swap.pass.cpp index a9a8e9a264c16..47f2abecade8f 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/swap.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/swap.pass.cpp @@ -17,9 +17,10 @@ // swap(x.acc_, y.acc_); #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/types.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/types.pass.cpp index 7e0235105b0b1..934e861e78d78 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/types.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/types.pass.cpp @@ -28,9 +28,10 @@ // }; #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h b/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h index e9262c5d95db1..bbd01d17b9ce3 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h @@ -15,6 +15,7 @@ #include #include #include +#include #include enum class offset_time_zone_flags { diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view.pass.cpp index c4c5e4f5b0231..18d13e32c63f0 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view.pass.cpp @@ -20,7 +20,9 @@ // explicit zoned_time(string_view name); #include +#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time.pass.cpp index c7fe8f24db687..34d8925709725 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time.pass.cpp @@ -20,7 +20,9 @@ // zoned_time(string_view name, const local_time& st); #include +#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time_choose.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time_choose.pass.cpp index 69eb4a17aada4..9f82cab3461bd 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time_choose.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time_choose.pass.cpp @@ -20,8 +20,8 @@ // zoned_time(string_view name, const local_time& st, choose c); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_sys_time.pass.cpp 
b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_sys_time.pass.cpp index 108a4b44706b7..46da996aa13db 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_sys_time.pass.cpp @@ -20,7 +20,9 @@ // zoned_time(string_view name, const sys_time& st); #include +#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2.pass.cpp index 68d7a5c74f47a..e4995f7400dac 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2.pass.cpp @@ -21,8 +21,8 @@ // zoned_time(string_view name, const zoned_time& y); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2_choose.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2_choose.pass.cpp index a96975cc611f9..31f35fed0f985 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2_choose.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2_choose.pass.cpp @@ -21,8 +21,8 @@ // zoned_time(string_view name, const zoned_time& y, choose c); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/sys_time.pass.cpp index 431e18f94243d..e4b9d10d2f0ac 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/sys_time.pass.cpp @@ -20,6 +20,7 @@ // zoned_time(const sys_time& st); #include +#include #include #include diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer.pass.cpp index f18be244684bf..5b805fb5abb90 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer.pass.cpp @@ -20,6 +20,7 @@ // explicit zoned_time(TimeZonePtr z); #include +#include #include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time.pass.cpp index 72116ca710e43..aa20e2420f4b9 100644 --- 
a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time.pass.cpp @@ -20,8 +20,8 @@ // zoned_time(TimeZonePtr z, const local_time& st); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time_choose.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time_choose.pass.cpp index efe78141a5cf2..c78bf136fde44 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time_choose.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time_choose.pass.cpp @@ -20,8 +20,8 @@ // zoned_time(TimeZonePtr z, const local_time& st, choose c); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_sys_time.pass.cpp index d0769cbe1408b..290e564848683 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_sys_time.pass.cpp @@ -20,8 +20,8 @@ // zoned_time(TimeZonePtr z, const sys_time& st); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_local_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_local_time.pass.cpp index c465cda36c51e..017d073789f7d 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_local_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_local_time.pass.cpp @@ -20,6 +20,7 @@ // local_time get_local_time() const; #include +#include #include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_sys_time.pass.cpp index 31fcbb689bd85..dde3cff9b5df8 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_sys_time.pass.cpp @@ -20,6 +20,7 @@ // sys_time get_sys_time() const; #include +#include #include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_time_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_time_zone.pass.cpp index af441b0857320..bc4c8be064f2c 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_time_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_time_zone.pass.cpp @@ -20,6 +20,7 @@ // TimeZonePtr get_time_zone() const; #include +#include #include #include "../test_offset_time_zone.h" diff 
--git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_local_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_local_time.pass.cpp index 2a9c241997438..90d3484b2553c 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_local_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_local_time.pass.cpp @@ -20,6 +20,7 @@ // explicit operator local_time() const; #include +#include #include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_sys_time.pass.cpp index 327ed495aed1a..68d8a15ec9889 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_sys_time.pass.cpp @@ -20,6 +20,7 @@ // operator sys_time() const; #include +#include #include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.pass.cpp index c88bf11d61e0e..4708da4c38b9b 100644 --- a/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.pass.cpp @@ -14,6 +14,8 @@ // Integral& value, int base = 10) #include +#include + #include "test_macros.h" #include "charconv_test_helpers.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.roundtrip.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.roundtrip.pass.cpp index 5b5ea7c91e8af..b1f2c77b34f8a 100644 --- a/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.roundtrip.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.roundtrip.pass.cpp @@ -16,6 +16,8 @@ // Integral& value, int base = 10) #include +#include + #include "test_macros.h" #include "charconv_test_helpers.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp index a6aa590ee944f..e0f20f1e8e3ff 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.pass.cpp index f218967b08196..24484cfe475f9 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.pass.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "test_macros.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp index 621eb8a493fd3..0908aa33ea7fc 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp +++ 
b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.pass.cpp index d4436d0c05b41..e746f19b2cd02 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.pass.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "test_macros.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.to.chars/integral.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.to.chars/integral.pass.cpp index 420b0e168539d..e89b340bdfcbe 100644 --- a/libcxx/test/std/utilities/charconv/charconv.to.chars/integral.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.to.chars/integral.pass.cpp @@ -19,6 +19,7 @@ #include #include +#include #include "test_macros.h" #include "charconv_test_helpers.h" From 63da545ccdd41d9eb2392a8d0e848a65eb24f5fa Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 4 Sep 2024 13:43:41 -0700 Subject: [PATCH 154/425] Revert "Reland "AtomicExpand: Allow incrementally legalizing atomicrmw"" (#107307) Reverts llvm/llvm-project#106793 `Next == E` is not enough: https://lab.llvm.org/buildbot/#/builders/169/builds/2834 `Next` is deleted by `processAtomicInstr` --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 35 +- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 373 +++++++++----------- 5 files changed, 691 insertions(+), 836 deletions(-) diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 2da723a0cc175..39a705599f90c 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -351,30 +351,17 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) { bool MadeChange = false; - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE;) { - BasicBlock *BB = &*BBI; - ++BBI; - - BasicBlock::iterator Next; - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - I = Next) { - Instruction &Inst = *I; - Next = std::next(I); - - if (processAtomicInstr(&Inst)) { - MadeChange = true; - - // Detect control flow change and resume iteration from the original - // block to inspect any newly inserted blocks. This allows incremental - // legalization of atomicrmw and cmpxchg. - if (Next == E || BB != Next->getParent()) { - BBI = BB->getIterator(); - BBE = F.end(); - break; - } - } - } + SmallVector AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. 
+ for (Instruction &I : instructions(F)) + if (I.isAtomic() && !isa(&I)) + AtomicInsts.push_back(&I); + + for (auto *I : AtomicInsts) { + if (processAtomicInstr(I)) + MadeChange = true; } return MadeChange; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index ed9c1b037d0cc..0d230bb9dcc6e 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -43,49 +43,46 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte 
Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -131,49 +128,46 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -238,40 +232,36 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 @@ -337,40 +327,36 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 @@ -413,38 +399,35 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 @@ -486,40 +469,36 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl __adddf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 @@ -708,18 +687,18 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -732,33 +711,29 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -824,18 +799,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -845,28 +819,25 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -914,49 +885,45 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; 
SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 888b795876f7d..bfe0d20ca814b 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -45,49 +45,46 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // 
%bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -133,49 +130,46 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // 
%atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -240,40 +234,36 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 @@ -339,40 +329,36 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 @@ -415,38 +401,35 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 @@ -488,40 +471,36 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmax -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 @@ -588,18 +567,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -612,33 +591,29 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -748,18 +723,17 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -769,28 +743,25 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -838,49 +809,45 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index a3665c6e42860..6b7d2df044460 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -45,49 +45,46 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -133,49 +130,46 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -240,40 +234,36 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 @@ -339,40 +329,36 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 @@ -415,38 +401,35 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 @@ -488,40 +471,36 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmin -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 @@ -588,18 +567,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -612,33 +591,29 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -748,18 +723,17 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -769,28 +743,25 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -838,49 +809,45 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 7725ce0e73185..67e164037d5ce 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -43,49 +43,46 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -131,49 +128,46 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -238,40 +232,36 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 @@ -337,40 +327,36 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 @@ -413,38 +399,35 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 @@ -486,40 +469,36 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl __subdf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 @@ -708,18 +687,18 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -732,33 +711,29 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -824,18 +799,17 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -845,28 +819,25 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -914,49 +885,45 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; 
SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload From 52dc4918ca8b874ddd4e4fcad873a66ecc5b6953 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 16:47:20 -0400 Subject: [PATCH 155/425] [libc++][NFC] Use consistent layout for license in Python files Most Python files were using `# === [...]` instead of `#=== [...]` so I went with what was the most common in the codebase. --- libcxx/test/libcxx/clang_modules_include.gen.py | 4 ++-- libcxx/test/libcxx/clang_tidy.gen.py | 4 ++-- libcxx/test/libcxx/double_include.gen.py | 4 ++-- libcxx/test/libcxx/header_inclusions.gen.py | 4 ++-- libcxx/test/libcxx/libcpp_version.gen.py | 4 ++-- libcxx/test/libcxx/no_assert_include.gen.py | 4 ++-- libcxx/test/libcxx/system_reserved_names.gen.py | 4 ++-- libcxx/test/libcxx/transitive_includes.gen.py | 4 ++-- libcxx/utils/adb_run.py | 4 ++-- libcxx/utils/ci/Dockerfile | 4 ++-- libcxx/utils/ci/apple-install-libcxx.sh | 4 ++-- libcxx/utils/ci/build-picolibc.sh | 4 ++-- libcxx/utils/ci/buildkite-pipeline.yml | 4 ++-- libcxx/utils/ci/run-buildbot | 4 ++-- libcxx/utils/ci/vendor/android/Dockerfile.emulator | 4 ++-- libcxx/utils/ci/vendor/android/build-emulator-images.sh | 4 ++-- libcxx/utils/ci/vendor/android/container-setup.sh | 4 ++-- libcxx/utils/ci/vendor/android/emulator-entrypoint.sh | 4 ++-- libcxx/utils/ci/vendor/android/emulator-functions.sh | 4 ++-- libcxx/utils/ci/vendor/android/emulator-wait-for-ready.sh | 4 ++-- libcxx/utils/ci/vendor/android/setup-env-for-emulator.sh | 4 ++-- libcxx/utils/ci/vendor/android/start-emulator.sh | 4 ++-- libcxx/utils/ci/vendor/android/stop-emulator.sh | 4 ++-- libcxx/utils/libcxx/test/android.py | 4 ++-- 24 files changed, 48 insertions(+), 48 deletions(-) diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py index f084c38a8fbd3..f0421b2e73813 100644 --- a/libcxx/test/libcxx/clang_modules_include.gen.py +++ b/libcxx/test/libcxx/clang_modules_include.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that we can include each header in a TU while using modules. # This is important notably because the LLDB data formatters use diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py index 76b9db2d5cb83..5e84fbbb9913f 100644 --- a/libcxx/test/libcxx/clang_tidy.gen.py +++ b/libcxx/test/libcxx/clang_tidy.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Run our custom libc++ clang-tidy checks on all public headers. diff --git a/libcxx/test/libcxx/double_include.gen.py b/libcxx/test/libcxx/double_include.gen.py index c7cb38b8f3590..afc2947dbece9 100644 --- a/libcxx/test/libcxx/double_include.gen.py +++ b/libcxx/test/libcxx/double_include.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that we can include each header in two TU's and link them together. diff --git a/libcxx/test/libcxx/header_inclusions.gen.py b/libcxx/test/libcxx/header_inclusions.gen.py index faaa4cf8710c1..2ecc47cbb1891 100644 --- a/libcxx/test/libcxx/header_inclusions.gen.py +++ b/libcxx/test/libcxx/header_inclusions.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that all headers include all the other headers they're supposed to, as # prescribed by the Standard. diff --git a/libcxx/test/libcxx/libcpp_version.gen.py b/libcxx/test/libcxx/libcpp_version.gen.py index 7d9519d58955b..a9995295e21e4 100644 --- a/libcxx/test/libcxx/libcpp_version.gen.py +++ b/libcxx/test/libcxx/libcpp_version.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that all headers define the _LIBCPP_VERSION macro. diff --git a/libcxx/test/libcxx/no_assert_include.gen.py b/libcxx/test/libcxx/no_assert_include.gen.py index dd8006df93764..67ab98603ca8f 100644 --- a/libcxx/test/libcxx/no_assert_include.gen.py +++ b/libcxx/test/libcxx/no_assert_include.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Ensure that none of the standard C++ headers implicitly include cassert or # assert.h (because assert() is implemented as a macro). diff --git a/libcxx/test/libcxx/system_reserved_names.gen.py b/libcxx/test/libcxx/system_reserved_names.gen.py index 956a8d1abe3c3..e29e7a2cdd614 100644 --- a/libcxx/test/libcxx/system_reserved_names.gen.py +++ b/libcxx/test/libcxx/system_reserved_names.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that headers are not tripped up by the surrounding code defining various # alphabetic macros. Also ensure that we don't swallow the definition of user diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py index 834f21f125437..22075364bf1b7 100644 --- a/libcxx/test/libcxx/transitive_includes.gen.py +++ b/libcxx/test/libcxx/transitive_includes.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that we don't remove transitive includes of public C++ headers in the library accidentally. 
# When we remove a transitive public include, clients tend to break because they don't always diff --git a/libcxx/utils/adb_run.py b/libcxx/utils/adb_run.py index dc15b51d7f605..cddbd191e9881 100755 --- a/libcxx/utils/adb_run.py +++ b/libcxx/utils/adb_run.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## """adb_run.py is a utility for running a libc++ test program via adb. """ diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile index 490bee4942e03..dbecfd9fd3325 100644 --- a/libcxx/utils/ci/Dockerfile +++ b/libcxx/utils/ci/Dockerfile @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # This file defines the buildkite and github actions builder images. # You can build both images using: diff --git a/libcxx/utils/ci/apple-install-libcxx.sh b/libcxx/utils/ci/apple-install-libcxx.sh index 1ef52b11b70bd..1b1c30449d5af 100755 --- a/libcxx/utils/ci/apple-install-libcxx.sh +++ b/libcxx/utils/ci/apple-install-libcxx.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -e diff --git a/libcxx/utils/ci/build-picolibc.sh b/libcxx/utils/ci/build-picolibc.sh index 400e4dcab99a3..521c1bef9fc7e 100755 --- a/libcxx/utils/ci/build-picolibc.sh +++ b/libcxx/utils/ci/build-picolibc.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # This script builds picolibc (https://github.com/picolibc/picolibc) from diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml index d02d11ae7a756..906df734bc42b 100644 --- a/libcxx/utils/ci/buildkite-pipeline.yml +++ b/libcxx/utils/ci/buildkite-pipeline.yml @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # This file describes the various pre-commit CI bots used to test libc++. diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 102f1669e63b8..14ff611302981 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -ex set -o pipefail diff --git a/libcxx/utils/ci/vendor/android/Dockerfile.emulator b/libcxx/utils/ci/vendor/android/Dockerfile.emulator index 2f52b27f6edcd..6ce9b82d258eb 100644 --- a/libcxx/utils/ci/vendor/android/Dockerfile.emulator +++ b/libcxx/utils/ci/vendor/android/Dockerfile.emulator @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## FROM ubuntu:jammy diff --git a/libcxx/utils/ci/vendor/android/build-emulator-images.sh b/libcxx/utils/ci/vendor/android/build-emulator-images.sh index f467ffc6231f5..4e29c172e47ed 100755 --- a/libcxx/utils/ci/vendor/android/build-emulator-images.sh +++ b/libcxx/utils/ci/vendor/android/build-emulator-images.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -e diff --git a/libcxx/utils/ci/vendor/android/container-setup.sh b/libcxx/utils/ci/vendor/android/container-setup.sh index 56bc232fefa1e..40e405b65f833 100755 --- a/libcxx/utils/ci/vendor/android/container-setup.sh +++ b/libcxx/utils/ci/vendor/android/container-setup.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -e diff --git a/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh b/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh index 99d4995b2ee1d..dcd7870e37a7d 100755 --- a/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh +++ b/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # This script is the entrypoint of an Android Emulator Docker container. diff --git a/libcxx/utils/ci/vendor/android/emulator-functions.sh b/libcxx/utils/ci/vendor/android/emulator-functions.sh index 27eea2af157cc..56770654989d7 100644 --- a/libcxx/utils/ci/vendor/android/emulator-functions.sh +++ b/libcxx/utils/ci/vendor/android/emulator-functions.sh @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Bash functions for managing the names of emulator system images. diff --git a/libcxx/utils/ci/vendor/android/emulator-wait-for-ready.sh b/libcxx/utils/ci/vendor/android/emulator-wait-for-ready.sh index 0c35794792891..73b657cb1433b 100755 --- a/libcxx/utils/ci/vendor/android/emulator-wait-for-ready.sh +++ b/libcxx/utils/ci/vendor/android/emulator-wait-for-ready.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -ex diff --git a/libcxx/utils/ci/vendor/android/setup-env-for-emulator.sh b/libcxx/utils/ci/vendor/android/setup-env-for-emulator.sh index 7de6cde7a7ad4..a822b26ffe6fd 100644 --- a/libcxx/utils/ci/vendor/android/setup-env-for-emulator.sh +++ b/libcxx/utils/ci/vendor/android/setup-env-for-emulator.sh @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## export ADB_SERVER_SOCKET="tcp:$(docker inspect \ -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' \ diff --git a/libcxx/utils/ci/vendor/android/start-emulator.sh b/libcxx/utils/ci/vendor/android/start-emulator.sh index 2d6e272675ea0..bec8048e5c5fa 100755 --- a/libcxx/utils/ci/vendor/android/start-emulator.sh +++ b/libcxx/utils/ci/vendor/android/start-emulator.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Starts a new Docker container using a Docker image containing the Android # Emulator and an OS image. Stops and removes the old container if it exists diff --git a/libcxx/utils/ci/vendor/android/stop-emulator.sh b/libcxx/utils/ci/vendor/android/stop-emulator.sh index b5797ccb344f4..4964fcf204505 100755 --- a/libcxx/utils/ci/vendor/android/stop-emulator.sh +++ b/libcxx/utils/ci/vendor/android/stop-emulator.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -e diff --git a/libcxx/utils/libcxx/test/android.py b/libcxx/utils/libcxx/test/android.py index a40305b4dca02..4536c9b156823 100644 --- a/libcxx/utils/libcxx/test/android.py +++ b/libcxx/utils/libcxx/test/android.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## import re import select From 16900d3b98e6c8fbdad4411a054e3566bbbf9235 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 4 Sep 2024 22:01:04 +0100 Subject: [PATCH 156/425] LICM: hoist BO assoc when BinOp is in RHS (#107072) Extend hoistBOAssociation smoothly to handle the case when the inner BinaryOperator is in the RHS of the outer BinaryOperator. This completes the generalization of hoistBOAssociation, and the only limitation after this patch is the fact that only Add and Mul are hoisted. --- llvm/lib/Transforms/Scalar/LICM.cpp | 26 ++-- llvm/test/Transforms/LICM/hoist-binop.ll | 161 ++++++++++++++++++++++- 2 files changed, 171 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 5cf7c252bb5f3..23e9c70b62642 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2803,15 +2803,14 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L, /// Reassociate associative binary expressions of the form /// -/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)" if op is an associative BinOp -/// 2. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is a commutative BinOp +/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)" +/// 2. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" +/// 3. "C2 op (C1 op LV)" ==> "LV op (C1 op C2)" +/// 4. "C2 op (LV op C1)" ==> "LV op (C1 op C2)" /// -/// where LV is a loop variant, and C1 and C2 are loop invariants that we want -/// to hoist. -/// -/// TODO: This can be extended to more cases such as -/// 1. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV" if op an associative BinOp -/// 2. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is a commutative BinOp +/// where op is an associative BinOp, LV is a loop variant, and C1 and C2 are +/// loop invariants that we want to hoist, noting that associativity implies +/// commutativity. static bool hoistBOAssociation(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, AssumptionCache *AC, @@ -2825,19 +2824,20 @@ static bool hoistBOAssociation(Instruction &I, Loop &L, if (Opcode != Instruction::Add && Opcode != Instruction::Mul) return false; - auto *BO0 = dyn_cast(BO->getOperand(0)); + bool LVInRHS = L.isLoopInvariant(BO->getOperand(0)); + auto *BO0 = dyn_cast(BO->getOperand(LVInRHS)); if (!BO0 || BO0->getOpcode() != Opcode || !BO0->isAssociative() || BO0->hasNUsesOrMore(3)) return false; Value *LV = BO0->getOperand(0); Value *C1 = BO0->getOperand(1); - Value *C2 = BO->getOperand(1); + Value *C2 = BO->getOperand(!LVInRHS); - if (L.isLoopInvariant(LV) && !L.isLoopInvariant(C1)) { - assert(BO0->isCommutative() && "Associativity implies commutativity"); + assert(BO->isCommutative() && BO0->isCommutative() && + "Associativity implies commutativity"); + if (L.isLoopInvariant(LV) && !L.isLoopInvariant(C1)) std::swap(LV, C1); - } if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) || !L.isLoopInvariant(C2)) return false; diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll index b0ee45a5fb350..a840e24757884 100644 --- a/llvm/test/Transforms/LICM/hoist-binop.ll +++ b/llvm/test/Transforms/LICM/hoist-binop.ll @@ -67,8 +67,8 @@ loop: br label %loop } - -; Hoist ADD and copy NUW if both ops have it. Commutative version. 
+; Hoist ADD and copy NUW if both ops have it. +; Version where operands are commuted. define void @add_nuw_comm(i64 %c1, i64 %c2) { ; CHECK-LABEL: @add_nuw_comm( ; CHECK-NEXT: entry: @@ -92,6 +92,83 @@ loop: br label %loop } +; Hoist ADD and copy NUW if both ops have it. +; Another version where operands are commuted. +define void @add_nuw_comm2(i64 %c1, i64 %c2) { +; CHECK-LABEL: @add_nuw_comm2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw i64 [[INDEX]], [[C1]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = add nuw i64 %index, %c1 + call void @use(i64 %step.add) + %index.next = add nuw i64 %c2, %step.add + br label %loop +} + +; Hoist ADD and copy NUW if both ops have it. +; Another version where operands are commuted. +define void @add_nuw_comm3(i64 %c1, i64 %c2) { +; CHECK-LABEL: @add_nuw_comm3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = add nuw i64 %c1, %index + call void @use(i64 %step.add) + %index.next = add nuw i64 %c2, %step.add + br label %loop +} + +; Hoist ADD and copy NUW if both ops have it. +; A version where the LHS and RHS of the outer BinOp are BinOps. +define void @add_nuw_twobinops(i64 %c1, i64 %c2) { +; CHECK-LABEL: @add_nuw_twobinops( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C2_PLUS_2:%.*]] = add nuw i64 [[C2:%.*]], 2 +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2_PLUS_2]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = add nuw i64 %c1, %index + call void @use(i64 %step.add) + %c2.plus.2 = add nuw i64 %c2, 2 + %index.next = add nuw i64 %step.add, %c2.plus.2 + br label %loop +} + ; Hoist MUL and drop NUW even if both ops have it. define void @mul_nuw(i64 %c1, i64 %c2) { ; CHECK-LABEL: @mul_nuw( @@ -116,7 +193,8 @@ loop: br label %loop } -; Hoist MUL and drop NUW even if both ops have it. Commutative version. +; Hoist MUL and drop NUW even if both ops have it. +; Version where operands are commuted. define void @mul_nuw_comm(i64 %c1, i64 %c2) { ; CHECK-LABEL: @mul_nuw_comm( ; CHECK-NEXT: entry: @@ -140,6 +218,83 @@ loop: br label %loop } +; Hoist MUL and drop NUW even if both ops have it. +; Another version where operands are commuted. 
+define void @mul_nuw_comm2(i64 %c1, i64 %c2) { +; CHECK-LABEL: @mul_nuw_comm2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = mul nuw i64 [[INDEX]], [[C1]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = mul nuw i64 %index, %c1 + call void @use(i64 %step.add) + %index.next = mul nuw i64 %c2, %step.add + br label %loop +} + +; Hoist MUL and drop NUW even if both ops have it. +; Another version where operands are commuted. +define void @mul_nuw_comm3(i64 %c1, i64 %c2) { +; CHECK-LABEL: @mul_nuw_comm3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = mul nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = mul nuw i64 %c1, %index + call void @use(i64 %step.add) + %index.next = mul nuw i64 %c2, %step.add + br label %loop +} + +; Hoist MUL and drop NUW even if both ops have it. +; A version where the LHS and RHS of the outer BinOp are BinOps. +define void @mul_nuw_twobinops(i64 %c1, i64 %c2) { +; CHECK-LABEL: @mul_nuw_twobinops( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C2_PLUS_2:%.*]] = add nuw i64 [[C2:%.*]], 2 +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2_PLUS_2]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = mul nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = mul nuw i64 %c1, %index + call void @use(i64 %step.add) + %c2.plus.2 = add nuw i64 %c2, 2 + %index.next = mul nuw i64 %step.add, %c2.plus.2 + br label %loop +} + ; Hoist ADD but don't copy NUW if only one op has it. define void @add_no_nuw(i64 %c1, i64 %c2) { ; CHECK-LABEL: @add_no_nuw( From 1ff8657b26870e9db4527b621fab0d21b6cbdc3c Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Wed, 4 Sep 2024 14:12:25 -0700 Subject: [PATCH 157/425] [scudo] Use variable instead of recomputing. (#106647) In the get fragmentation functions, there is already a variable that computes the in use bytes, so use that instead of recomputing it. 
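Shape of the change, as a standalone sketch (identifiers are illustrative, not the actual scudo internals): the byte total is computed once, and the later percentage computation should reference that value rather than repeat the pages-times-page-size multiplication.

  #include <cstdint>
  using uptr = uintptr_t;

  // Illustrative only: once InUseBytes has been computed, later uses pass it
  // along instead of re-evaluating InUsePages * PageSize at the call site.
  uptr inUseUtilizationPercent(uptr BlockSize, uptr InUseBlocks,
                               uptr InUsePages, uptr PageSize) {
    const uptr InUseBytes = InUsePages * PageSize; // computed once, reused below
    return InUseBytes ? (BlockSize * InUseBlocks * 100) / InUseBytes : 0;
  }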
--- compiler-rt/lib/scudo/standalone/primary32.h | 2 +- compiler-rt/lib/scudo/standalone/primary64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index ebfb8dfe0a31f..87264471be2c1 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -940,7 +940,7 @@ template class SizeClassAllocator32 { uptr Integral; uptr Fractional; - computePercentage(BlockSize * InUseBlocks, InUsePages * PageSize, &Integral, + computePercentage(BlockSize * InUseBlocks, InUseBytes, &Integral, &Fractional); Str->append(" %02zu (%6zu): inuse/total blocks: %6zu/%6zu inuse/total " "pages: %6zu/%6zu inuse bytes: %6zuK util: %3zu.%02zu%%\n", diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 8436f33c2fdcf..ffcc22fea0c6e 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -1185,7 +1185,7 @@ template class SizeClassAllocator64 { uptr Integral; uptr Fractional; - computePercentage(BlockSize * InUseBlocks, InUsePages * PageSize, &Integral, + computePercentage(BlockSize * InUseBlocks, InUseBytes, &Integral, &Fractional); Str->append(" %02zu (%6zu): inuse/total blocks: %6zu/%6zu inuse/total " "pages: %6zu/%6zu inuse bytes: %6zuK util: %3zu.%02zu%%\n", From dd754cd262222bcb489038ac791e4278d90697f0 Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov Date: Wed, 4 Sep 2024 14:32:46 -0700 Subject: [PATCH 158/425] [compiler-rt][nsan] Update UnwindImpl (#107313) Implement __sanitizer::BufferedStackTrace::UnwindImpl following msan. --- compiler-rt/lib/nsan/nsan.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/nsan/nsan.cpp b/compiler-rt/lib/nsan/nsan.cpp index 4be9c673bd4e0..4679bcd589eb4 100644 --- a/compiler-rt/lib/nsan/nsan.cpp +++ b/compiler-rt/lib/nsan/nsan.cpp @@ -200,7 +200,14 @@ void __sanitizer::BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, bool request_fast, u32 max_depth) { using namespace __nsan; - return Unwind(max_depth, pc, bp, context, 0, 0, false); + NsanThread *t = GetCurrentThread(); + if (!t || !StackTrace::WillUseFastUnwind(request_fast)) + return Unwind(max_depth, pc, bp, context, t ? t->stack_top() : 0, + t ? t->stack_bottom() : 0, false); + if (StackTrace::WillUseFastUnwind(request_fast)) + Unwind(max_depth, pc, bp, nullptr, t->stack_top(), t->stack_bottom(), true); + else + Unwind(max_depth, pc, 0, context, 0, 0, false); } extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __nsan_print_accumulated_stats() { From dcf0160bd61d150e7b94067fcd991b466a361b08 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 4 Sep 2024 14:46:48 -0700 Subject: [PATCH 159/425] [TableGen] Optimize intrinsic info type signature encoding (#106809) Change the "fixed encoding" table used for encoding intrinsic type signature to use 16-bit encoding as opposed to 32-bit. This results in both space and time improvements. For space, the total static storage size (in bytes) of this info reduces by 50%: - Current = 14193*4 (Fixed table) + 16058 + 3 (Long Table) = 72833 - New size = 14193*2 (Fixed table) + 19879 + 3 (Long Table) = 48268. - Reduction = 50.9% For time, with the added benchmark, we see a 7.3% speedup in `GetIntrinsicInfoTableEntries` benchmark. Actual output of the benchmark in included in the GitHub MR. 
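To make the packing rule concrete, the sketch below (standalone and simplified; packFixed16 and longTableEntry are illustrative names, not the emitter's API) shows how a nibble signature either fits the 15 usable bits of a 16-bit entry or is redirected to the long encoding table via an offset with the top bit set:

  #include <cstdint>
  #include <optional>
  #include <vector>

  // Pack a type signature, one nibble per element with element 0 in the low
  // nibble, into a 16-bit table entry. The most significant bit is reserved
  // as the "offset into the long encoding table" sentinel, so only values
  // that fit in the low 15 bits can use the fixed encoding.
  std::optional<uint16_t> packFixed16(const std::vector<unsigned char> &Sig) {
    uint32_t Packed = 0;
    for (auto It = Sig.rbegin(); It != Sig.rend(); ++It) {
      if (*It > 15)
        return std::nullopt;          // element does not fit in a nibble
      Packed = (Packed << 4) | *It;
    }
    if (Packed & ~0x7fffu)            // would clobber the sentinel bit
      return std::nullopt;
    return static_cast<uint16_t>(Packed);
  }

  // Signatures that do not fit are stored as (1 << 15) | Offset instead.
  uint16_t longTableEntry(unsigned Offset) {
    return static_cast<uint16_t>((1u << 15) | Offset);
  }

Short signatures pack directly; longer ones spill to IIT_LongEncodingTable, which is why that table grows while the fixed table halves in width.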
--- llvm/benchmarks/CMakeLists.txt | 1 + .../GetIntrinsicInfoTableEntriesBM.cpp | 30 +++++++ llvm/lib/IR/Function.cpp | 14 ++-- llvm/utils/TableGen/IntrinsicEmitter.cpp | 80 +++++++++++-------- 4 files changed, 87 insertions(+), 38 deletions(-) create mode 100644 llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index e3366e6f3ffe1..aa0cb77773344 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -6,3 +6,4 @@ add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) +add_benchmark(GetIntrinsicInfoTableEntriesBM GetIntrinsicInfoTableEntriesBM.cpp PARTIAL_SOURCES_INTENDED) diff --git a/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp new file mode 100644 index 0000000000000..7f3bd3bc9eb6b --- /dev/null +++ b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp @@ -0,0 +1,30 @@ +//===- GetIntrinsicInfoTableEntries.cpp - IIT signature benchmark ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmark/benchmark.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; +using namespace Intrinsic; + +static void BM_GetIntrinsicInfoTableEntries(benchmark::State &state) { + SmallVector Table; + for (auto _ : state) { + for (ID ID = 1; ID < num_intrinsics; ++ID) { + // This makes sure the vector does not keep growing, as well as after the + // first iteration does not result in additional allocations. + Table.clear(); + getIntrinsicInfoTableEntries(ID, Table); + } + } +} + +BENCHMARK(BM_GetIntrinsicInfoTableEntries); + +BENCHMARK_MAIN(); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 69520fdb03dc7..afef8930669e8 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1381,22 +1381,24 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, void Intrinsic::getIntrinsicInfoTableEntries(ID id, SmallVectorImpl &T){ + static_assert(sizeof(IIT_Table[0]) == 2, + "Expect 16-bit entries in IIT_Table"); // Check to see if the intrinsic's type was expressible by the table. - unsigned TableVal = IIT_Table[id-1]; + uint16_t TableVal = IIT_Table[id - 1]; // Decode the TableVal into an array of IITValues. - SmallVector IITValues; + SmallVector IITValues; ArrayRef IITEntries; unsigned NextElt = 0; - if ((TableVal >> 31) != 0) { + if (TableVal >> 15) { // This is an offset into the IIT_LongEncodingTable. IITEntries = IIT_LongEncodingTable; // Strip sentinel bit. - NextElt = (TableVal << 1) >> 1; + NextElt = TableVal & 0x7fff; } else { - // Decode the TableVal into an array of IITValues. If the entry was encoded - // into a single word in the table itself, decode it now. + // If the entry was encoded into a single word in the table itself, decode + // it from an array of nibbles to an array of bytes. 
do { IITValues.push_back(TableVal & 0xF); TableVal >>= 4; diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 09eb1ed5e1863..0f4d7bf8db217 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -282,11 +282,37 @@ static TypeSigTy ComputeTypeSignature(const CodeGenIntrinsic &Int) { return TypeSig; } +// Pack the type signature into 32-bit fixed encoding word. +static std::optional encodePacked(const TypeSigTy &TypeSig) { + if (TypeSig.size() > 8) + return std::nullopt; + + uint32_t Result = 0; + for (unsigned char C : reverse(TypeSig)) { + if (C > 15) + return std::nullopt; + Result = (Result << 4) | C; + } + return Result; +} + void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - // If we can compute a 32-bit fixed encoding for this intrinsic, do so and + // Note: the code below can be switched to use 32-bit fixed encoding by + // flipping the flag below. + constexpr bool Use16BitFixedEncoding = true; + using FixedEncodingTy = + std::conditional_t; + constexpr unsigned FixedEncodingBits = sizeof(FixedEncodingTy) * CHAR_BIT; + // Mask with all bits 1 except the most significant bit. + const unsigned Mask = (1U << (FixedEncodingBits - 1)) - 1; + const unsigned MSBPostion = FixedEncodingBits - 1; + StringRef FixedEncodingTypeName = + Use16BitFixedEncoding ? "uint16_t" : "uint32_t"; + + // If we can compute a 16/32-bit fixed encoding for this intrinsic, do so and // capture it in this vector, otherwise store a ~0U. - std::vector FixedEncodings; + std::vector FixedEncodings; SequenceToOffsetTable LongEncodingTable; FixedEncodings.reserve(Ints.size()); @@ -296,69 +322,59 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, // Get the signature for the intrinsic. TypeSigTy TypeSig = ComputeTypeSignature(Int); - // Check to see if we can encode it into a 32-bit word. We can only encode - // 8 nibbles into a 32-bit word. - if (TypeSig.size() <= 8) { - // Attempt to pack elements of TypeSig into a 32-bit word, starting from - // the most significant nibble. - unsigned Result = 0; - bool Failed = false; - for (unsigned char C : reverse(TypeSig)) { - if (C > 15) { - Failed = true; - break; - } - Result = (Result << 4) | C; - } - - // If this could be encoded into a 31-bit word, return it. - if (!Failed && (Result >> 31) == 0) { - FixedEncodings.push_back(Result); - continue; - } + // Check to see if we can encode it into a 16/32 bit word. + std::optional Result = encodePacked(TypeSig); + if (Result && (*Result & Mask) == Result) { + FixedEncodings.push_back(static_cast(*Result)); + continue; } - // Otherwise, we're going to unique the sequence into the - // LongEncodingTable, and use its offset in the 32-bit table instead. LongEncodingTable.add(TypeSig); // This is a placehold that we'll replace after the table is laid out. - FixedEncodings.push_back(~0U); + FixedEncodings.push_back(static_cast(~0U)); } LongEncodingTable.layout(); - OS << R"(// Global intrinsic function declaration type table. + OS << formatv(R"(// Global intrinsic function declaration type table. #ifdef GET_INTRINSIC_GENERATOR_GLOBAL -static constexpr unsigned IIT_Table[] = { - )"; +static constexpr {0} IIT_Table[] = {{ + )", + FixedEncodingTypeName); + unsigned MaxOffset = 0; for (auto [Idx, FixedEncoding, Int] : enumerate(FixedEncodings, Ints)) { if ((Idx & 7) == 7) OS << "\n "; // If the entry fit in the table, just emit it. 
- if (FixedEncoding != ~0U) { + if ((FixedEncoding & Mask) == FixedEncoding) { OS << "0x" << Twine::utohexstr(FixedEncoding) << ", "; continue; } TypeSigTy TypeSig = ComputeTypeSignature(Int); + unsigned Offset = LongEncodingTable.get(TypeSig); + MaxOffset = std::max(MaxOffset, Offset); // Otherwise, emit the offset into the long encoding table. We emit it this // way so that it is easier to read the offset in the .def file. - OS << "(1U<<31) | " << LongEncodingTable.get(TypeSig) << ", "; + OS << formatv("(1U<<{0}) | {1}, ", MSBPostion, Offset); } OS << "0\n};\n\n"; + // verify that all offsets will fit in 16/32 bits. + if ((MaxOffset & Mask) != MaxOffset) + PrintFatalError("Offset of long encoding table exceeds encoding bits"); + // Emit the shared table of register lists. OS << "static constexpr unsigned char IIT_LongEncodingTable[] = {\n"; if (!LongEncodingTable.empty()) LongEncodingTable.emit( OS, [](raw_ostream &OS, unsigned char C) { OS << (unsigned)C; }); - OS << " 255\n};\n\n"; - + OS << " 255\n};\n"; OS << "#endif\n\n"; // End of GET_INTRINSIC_GENERATOR_GLOBAL } From 660cc98647677815a3f5d97d00220071d8cf7a4f Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 4 Sep 2024 14:58:01 -0700 Subject: [PATCH 160/425] [TableGen] Add `CodeGenIntrinsicsMap` for on-demand intrinsic creation (#107100) - Add class `CodeGenIntrinsicMap` for on-demand creation of `CodeGenIntrinsic`. - Add class `CodeGenIntrinsicContext` to capture global information required to build `CodeGenIntrinsic` objects. - Adopt GlobalISel PatternParser and SearchableTableEmitter to use it. --- .../TableGen/Basic/CodeGenIntrinsics.cpp | 28 +++++++++++++------ llvm/utils/TableGen/Basic/CodeGenIntrinsics.h | 22 +++++++++++++-- .../TableGen/Common/CodeGenDAGPatterns.cpp | 6 ++-- .../Common/GlobalISel/PatternParser.cpp | 11 +++----- .../utils/TableGen/SearchableTableEmitter.cpp | 13 +++++---- 5 files changed, 52 insertions(+), 28 deletions(-) diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index 4bca904e9f38b..c0edbf0f01523 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -25,20 +25,20 @@ using namespace llvm; // CodeGenIntrinsic Implementation //===----------------------------------------------------------------------===// -CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { - std::vector IntrProperties = - RC.getAllDerivedDefinitions("IntrinsicProperty"); - - std::vector DefaultProperties; - for (const Record *Rec : IntrProperties) +CodeGenIntrinsicContext::CodeGenIntrinsicContext(const RecordKeeper &RC) { + for (const Record *Rec : RC.getAllDerivedDefinitions("IntrinsicProperty")) if (Rec->getValueAsBit("IsDefault")) DefaultProperties.push_back(Rec); +} + +CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { + CodeGenIntrinsicContext Ctx(RC); std::vector Defs = RC.getAllDerivedDefinitions("Intrinsic"); Intrinsics.reserve(Defs.size()); for (const Record *Def : Defs) - Intrinsics.push_back(CodeGenIntrinsic(Def, DefaultProperties)); + Intrinsics.push_back(CodeGenIntrinsic(Def, Ctx)); llvm::sort(Intrinsics, [](const CodeGenIntrinsic &LHS, const CodeGenIntrinsic &RHS) { @@ -54,8 +54,18 @@ CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { Targets.back().Count = Intrinsics.size() - Targets.back().Offset; } +CodeGenIntrinsic &CodeGenIntrinsicMap::operator[](const Record *Record) { + if (!Record->isSubClassOf("Intrinsic")) + 
PrintFatalError("Intrinsic defs should be subclass of 'Intrinsic' class"); + + auto [Iter, Inserted] = Map.try_emplace(Record); + if (Inserted) + Iter->second = std::make_unique(Record, Ctx); + return *Iter->second; +} + CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, - ArrayRef DefaultProperties) + const CodeGenIntrinsicContext &Ctx) : TheDef(R) { StringRef DefName = TheDef->getName(); ArrayRef DefLoc = R->getLoc(); @@ -119,7 +129,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, } // Set default properties to true. - setDefaultProperties(DefaultProperties); + setDefaultProperties(Ctx.DefaultProperties); // Also record the SDPatternOperator Properties. Properties = parseSDPatternOperatorProperties(R); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 19e29af2fa85f..51c2359155380 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -15,6 +15,7 @@ #include "SDNodeProperties.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/ModRef.h" #include @@ -25,6 +26,12 @@ namespace llvm { class Record; class RecordKeeper; +// Global information needed to build intrinsics. +struct CodeGenIntrinsicContext { + explicit CodeGenIntrinsicContext(const RecordKeeper &RC); + std::vector DefaultProperties; +}; + struct CodeGenIntrinsic { const Record *TheDef; // The actual record defining this intrinsic. std::string Name; // The name of the LLVM function "llvm.bswap.i32" @@ -155,8 +162,7 @@ struct CodeGenIntrinsic { bool isParamImmArg(unsigned ParamIdx) const; - CodeGenIntrinsic(const Record *R, - ArrayRef DefaultProperties = {}); + CodeGenIntrinsic(const Record *R, const CodeGenIntrinsicContext &Ctx); }; class CodeGenIntrinsicTable { @@ -171,7 +177,6 @@ class CodeGenIntrinsicTable { std::vector Targets; explicit CodeGenIntrinsicTable(const RecordKeeper &RC); - CodeGenIntrinsicTable() = default; bool empty() const { return Intrinsics.empty(); } size_t size() const { return Intrinsics.size(); } @@ -182,6 +187,17 @@ class CodeGenIntrinsicTable { return Intrinsics[Pos]; } }; + +// This class builds `CodeGenIntrinsic` on demand for a given Def. 
+class CodeGenIntrinsicMap { + DenseMap> Map; + const CodeGenIntrinsicContext Ctx; + +public: + explicit CodeGenIntrinsicMap(const RecordKeeper &RC) : Ctx(RC) {} + CodeGenIntrinsic &operator[](const Record *Def); +}; + } // namespace llvm #endif // LLVM_UTILS_TABLEGEN_BASIC_CODEGENINTRINSICS_H diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index a8cecca0d4a54..df3f72ff2ec7f 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -3160,10 +3160,8 @@ void TreePattern::dump() const { print(errs()); } CodeGenDAGPatterns::CodeGenDAGPatterns(RecordKeeper &R, PatternRewriterFn PatternRewriter) - : Records(R), Target(R), LegalVTS(Target.getLegalValueTypes()), - PatternRewriter(PatternRewriter) { - - Intrinsics = CodeGenIntrinsicTable(Records); + : Records(R), Target(R), Intrinsics(R), + LegalVTS(Target.getLegalValueTypes()), PatternRewriter(PatternRewriter) { ParseNodeInfo(); ParseNodeTransforms(); ParseComplexPatterns(); diff --git a/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp index 82fa3c8f3121f..73b6097554eda 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp @@ -107,13 +107,10 @@ getInstrForIntrinsic(const CodeGenTarget &CGT, const CodeGenIntrinsic *I) { static const CodeGenIntrinsic *getCodeGenIntrinsic(Record *R) { // Intrinsics need to have a static lifetime because the match table keeps // references to CodeGenIntrinsic objects. - static DenseMap> - AllIntrinsics; - - auto &Ptr = AllIntrinsics[R]; - if (!Ptr) - Ptr = std::make_unique(R); - return Ptr.get(); + static CodeGenIntrinsicMap *AllIntrinsics; + if (!AllIntrinsics) + AllIntrinsics = new CodeGenIntrinsicMap(R->getRecords()); + return &(*AllIntrinsics)[R]; } std::unique_ptr diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp index 59ae3bf3daedc..8d394f8051ced 100644 --- a/llvm/utils/TableGen/SearchableTableEmitter.cpp +++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp @@ -94,7 +94,7 @@ struct GenericTable { class SearchableTableEmitter { RecordKeeper &Records; std::unique_ptr Target; - DenseMap> Intrinsics; + std::unique_ptr Intrinsics; std::vector> Enums; DenseMap EnumMap; std::set PreprocessorGuards; @@ -146,10 +146,13 @@ class SearchableTableEmitter { } CodeGenIntrinsic &getIntrinsic(Init *I) { - std::unique_ptr &Intr = Intrinsics[I]; - if (!Intr) - Intr = std::make_unique(cast(I)->getDef()); - return *Intr; + const Record *Def = cast(I)->getDef(); + // Build the Intrinsics map on demand. If we instantiate one in the + // constructor, we may get errors if the TableGen file being processed does + // not include Intrinsics.td and does not do anything with intrinsics. + if (!Intrinsics) + Intrinsics = std::make_unique(Records); + return (*Intrinsics)[Def]; } bool compareBy(Record *LHS, Record *RHS, const SearchIndex &Index); From 98c6bbfe1f3a348633e5e4c192a0134891fe3849 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 4 Sep 2024 15:04:10 -0700 Subject: [PATCH 161/425] [TableGen] Refactor Intrinsics record (#106986) Eliminate unused `isTarget` field in Intrinsic record. Eliminate `isOverloaded`, `Types` and `TypeSig` fields from the record, as they are already available through the `TypeInfo` field. 
Change intrinsic emitter code to look for this info using fields of the `TypeInfo` record attached to the `Intrinsic` record. Fix several intrinsic related unit tests to source the `Intrinsic` class def from Intrinsics.td as opposed to defining a skeleton in the test. This eliminates some duplication of information in the Intrinsic class, as well as reduces the memory allocated for record fields, resulting in ~2% reduction (though that's not the main goal). --- llvm/include/llvm/IR/Intrinsics.td | 5 -- llvm/test/TableGen/intrinsic-attrs.td | 52 +------------------ llvm/test/TableGen/intrinsic-long-name.td | 36 ++----------- llvm/test/TableGen/intrinsic-struct.td | 37 ++----------- .../TableGen/searchabletables-intrinsic.td | 37 ++----------- .../TableGen/Basic/CodeGenIntrinsics.cpp | 23 ++++---- llvm/utils/TableGen/IntrinsicEmitter.cpp | 11 ++-- 7 files changed, 35 insertions(+), 166 deletions(-) diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 232d6be1073f4..1bc895eee60f1 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -669,12 +669,7 @@ class Intrinsic ret_types, // IntrinsicProperty<1> bit DisableDefaultAttributes = disable_default_attributes; - bit isTarget = false; - TypeInfoGen TypeInfo = TypeInfoGen; - bit isOverloaded = TypeInfo.isOverloaded; - list Types = TypeInfo.Types; - list> TypeSig = TypeInfo.TypeSig; } // Intrinsic with default attributes (disable_default_attributes = false). diff --git a/llvm/test/TableGen/intrinsic-attrs.td b/llvm/test/TableGen/intrinsic-attrs.td index 29e8cb1e89bb0..3228b32405103 100644 --- a/llvm/test/TableGen/intrinsic-attrs.td +++ b/llvm/test/TableGen/intrinsic-attrs.td @@ -1,54 +1,6 @@ -// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include -DTEST_INTRINSICS_SUPPRESS_DEFS %s | FileCheck %s -// Get the minimum blurb necessary to process ... -include "llvm/CodeGen/ValueTypes.td" -include "llvm/CodeGen/SDNodeProperties.td" - -class LLVMType { - ValueType VT = vt; - int isAny = 0; -} - -def llvm_i32_ty : LLVMType; -def llvm_ptr_ty : LLVMType; - -class AttrIndex { - int Value = idx; -} - -def FuncIndex : AttrIndex<-1>; -def RetIndex : AttrIndex<0>; -class ArgIndex : AttrIndex; - -class IntrinsicProperty { - bit IsDefault = is_default; -} - -def IntrNoMem : IntrinsicProperty; -def IntrHasSideEffects : IntrinsicProperty; -class Dereferenceable : IntrinsicProperty { - int ArgNo = idx.Value; - int Bytes = bytes; -} - -class Intrinsic ret_types, - list param_types = [], - list intr_properties = [], - string name = "", - list sd_properties = [], - bit disable_default_attributes = 0> : SDPatternOperator { - string LLVMName = name; - string TargetPrefix = ""; - list RetTypes = ret_types; - list ParamTypes = param_types; - list IntrProperties = intr_properties; - let Properties = sd_properties; - bit DisableDefaultAttributes = 1; - - - bit isTarget = 0; - bit DisableDefaultAttributes = disable_default_attributes; -} +include "llvm/IR/Intrinsics.td" // ... this intrinsic. 
def int_random_gen : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects]>; diff --git a/llvm/test/TableGen/intrinsic-long-name.td b/llvm/test/TableGen/intrinsic-long-name.td index d66173202302b..c19910d474ed1 100644 --- a/llvm/test/TableGen/intrinsic-long-name.td +++ b/llvm/test/TableGen/intrinsic-long-name.td @@ -1,38 +1,10 @@ -// RUN: llvm-tblgen -gen-intrinsic-enums %s | FileCheck %s +// RUN: llvm-tblgen -gen-intrinsic-enums -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS | FileCheck %s // XFAIL: vg_leak -class IntrinsicProperty { - bit IsDefault = is_default; -} - -class SDNodeProperty; - -class ValueType { - string Namespace = "MVT"; - int Size = size; - int Value = value; -} - -class LLVMType { - ValueType VT = vt; -} - -class Intrinsic param_types = []> { - string LLVMName = name; - bit isTarget = 0; - string TargetPrefix = ""; - list RetTypes = []; - list ParamTypes = param_types; - list IntrProperties = []; - list Properties = []; - bit DisableDefaultAttributes = 1; -} - -def iAny : ValueType<0, 253>; -def llvm_anyint_ty : LLVMType; +include "llvm/IR/Intrinsics.td" // Make sure we generate the long name without crashing // CHECK: this_is_a_really_long_intrinsic_name_but_we_should_still_not_crash, // llvm.this.is.a.really.long.intrinsic.name.but.we.should.still.not.crash -def int_foo : Intrinsic<"llvm.foo", [llvm_anyint_ty]>; -def int_this_is_a_really_long_intrinsic_name_but_we_should_still_not_crash : Intrinsic<"llvm.this.is.a.really.long.intrinsic.name.but.we.should.still.not.crash", [llvm_anyint_ty]>; +def int_foo : Intrinsic<[llvm_anyint_ty], [], [], "llvm.foo">; +def int_this_is_a_really_long_intrinsic_name_but_we_should_still_not_crash : Intrinsic<[llvm_anyint_ty], [], [], "llvm.this.is.a.really.long.intrinsic.name.but.we.should.still.not.crash">; diff --git a/llvm/test/TableGen/intrinsic-struct.td b/llvm/test/TableGen/intrinsic-struct.td index bc044a4a6f858..f23a7a7643af2 100644 --- a/llvm/test/TableGen/intrinsic-struct.td +++ b/llvm/test/TableGen/intrinsic-struct.td @@ -1,38 +1,11 @@ -// RUN: llvm-tblgen -gen-intrinsic-enums %s | FileCheck %s +// RUN: llvm-tblgen -gen-intrinsic-enums -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS | FileCheck %s // XFAIL: vg_leak -class IntrinsicProperty { - bit IsDefault = is_default; -} - -class SDNodeProperty; - -class ValueType { - string Namespace = "MVT"; - int Size = size; - int Value = value; -} - -class LLVMType { - ValueType VT = vt; -} - -class Intrinsic ret_types = []> { - string LLVMName = name; - bit isTarget = 0; - string TargetPrefix = ""; - list RetTypes = ret_types; - list ParamTypes = []; - list IntrProperties = []; - list Properties = []; - bit DisableDefaultAttributes = 1; -} - -def iAny : ValueType<0, 253>; -def llvm_anyint_ty : LLVMType; +include "llvm/IR/Intrinsics.td" // Make sure we can return up to 8 values // CHECK: returns_8_results = {{[0-9]+}}, // llvm.returns.8.results -def int_returns_8_results : Intrinsic<"llvm.returns.8.results", +def int_returns_8_results : Intrinsic< [llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, - llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty]>; + llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty], + [], [], "llvm.returns.8.results">; diff --git a/llvm/test/TableGen/searchabletables-intrinsic.td b/llvm/test/TableGen/searchabletables-intrinsic.td index 75722d19b16e9..d4ec105f0243b 100644 --- a/llvm/test/TableGen/searchabletables-intrinsic.td +++ b/llvm/test/TableGen/searchabletables-intrinsic.td @@ -1,48 +1,19 @@ -// 
RUN: llvm-tblgen -gen-searchable-tables -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-searchable-tables -I %p/../../include -DTEST_INTRINSICS_SUPPRESS_DEFS %s | FileCheck %s // XFAIL: vg_leak include "llvm/TableGen/SearchableTable.td" - -class IntrinsicProperty { - bit IsDefault = is_default; -} - -class SDNodeProperty; - -class ValueType { - string Namespace = "MVT"; - int Size = size; - int Value = value; -} - -class LLVMType { - ValueType VT = vt; -} - -class Intrinsic param_types = []> { - string LLVMName = ""; - bit isTarget = 0; - string TargetPrefix = ""; - list RetTypes = []; - list ParamTypes = param_types; - list IntrProperties = []; - list Properties = []; - bit DisableDefaultAttributes = 1; -} - -def iAny : ValueType<0, 253>; -def llvm_anyint_ty : LLVMType; +include "llvm/IR/Intrinsics.td" def int_abc : Intrinsic<[llvm_anyint_ty]>; def int_xyz : Intrinsic<[llvm_anyint_ty]>; -let isTarget = 1, TargetPrefix = "gtarget" in { +let TargetPrefix = "gtarget" in { def int_gtarget_def : Intrinsic<[llvm_anyint_ty]>; def int_gtarget_defg : Intrinsic<[llvm_anyint_ty]>; def int_gtarget_uvw : Intrinsic<[llvm_anyint_ty]>; } -let isTarget = 1, TargetPrefix = "ftarget" in { +let TargetPrefix = "ftarget" in { def int_ftarget_ghi : Intrinsic<[llvm_anyint_ty]>; def int_ftarget_ghi_x : Intrinsic<[llvm_anyint_ty]>; def int_ftarget_rst : Intrinsic<[llvm_anyint_ty]>; diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index c0edbf0f01523..a30a7577408f8 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -106,17 +106,22 @@ CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, TargetPrefix + ".'!"); } - if (auto *Types = R->getValue("Types")) { - auto *TypeList = cast(Types->getValue()); - isOverloaded = R->getValueAsBit("isOverloaded"); + const Record *TypeInfo = R->getValueAsDef("TypeInfo"); + if (!TypeInfo->isSubClassOf("TypeInfoGen")) + PrintFatalError(DefLoc, "TypeInfo field in " + DefName + + " should be of subclass of TypeInfoGen!"); - unsigned I = 0; - for (unsigned E = R->getValueAsListInit("RetTypes")->size(); I < E; ++I) - IS.RetTys.push_back(TypeList->getElementAsRecord(I)); + isOverloaded = TypeInfo->getValueAsBit("isOverloaded"); + const ListInit *TypeList = TypeInfo->getValueAsListInit("Types"); - for (unsigned E = TypeList->size(); I < E; ++I) - IS.ParamTys.push_back(TypeList->getElementAsRecord(I)); - } + // Types field is a concatenation of Return types followed by Param types. + unsigned Idx = 0; + unsigned NumRet = R->getValueAsListInit("RetTypes")->size(); + for (; Idx < NumRet; ++Idx) + IS.RetTys.push_back(TypeList->getElementAsRecord(Idx)); + + for (unsigned E = TypeList->size(); Idx < E; ++Idx) + IS.ParamTys.push_back(TypeList->getElementAsRecord(Idx)); // Parse the intrinsic properties. ListInit *PropList = R->getValueAsListInit("IntrProperties"); diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 0f4d7bf8db217..bda97c61d3d58 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -273,11 +273,12 @@ using TypeSigTy = SmallVector; /// Computes type signature of the intrinsic \p Int. 
static TypeSigTy ComputeTypeSignature(const CodeGenIntrinsic &Int) { TypeSigTy TypeSig; - if (const auto *R = Int.TheDef->getValue("TypeSig")) { - for (const auto *a : cast(R->getValue())->getValues()) { - for (const auto *b : cast(a)->getValues()) - TypeSig.emplace_back(cast(b)->getValue()); - } + const Record *TypeInfo = Int.TheDef->getValueAsDef("TypeInfo"); + const ListInit *OuterList = TypeInfo->getValueAsListInit("TypeSig"); + + for (const auto *Outer : OuterList->getValues()) { + for (const auto *Inner : cast(Outer)->getValues()) + TypeSig.emplace_back(cast(Inner)->getValue()); } return TypeSig; } From df50751d24da4f5fdf8f46119c09a7e941f7174b Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 4 Sep 2024 15:12:58 -0700 Subject: [PATCH 162/425] [SandboxIR] Implement ConstantAggregateZero (#107172) This patch implements sandboxir::ConstantAggregateZero mirroring llvm::ConstantAggregateZero. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 44 +++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/include/llvm/SandboxIR/Type.h | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 78 +++++++++++++++---- llvm/lib/SandboxIR/Type.cpp | 5 ++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 69 ++++++++++++++++ llvm/unittests/SandboxIR/TypesTest.cpp | 4 + 7 files changed, 186 insertions(+), 16 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 89e963498426d..89bc9c46581fc 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -120,6 +120,7 @@ namespace sandboxir { class BasicBlock; class ConstantInt; class ConstantFP; +class ConstantAggregateZero; class Context; class Function; class Instruction; @@ -316,6 +317,7 @@ class Value { friend class CmpInst; // For getting `Val`. friend class ConstantArray; // For `Val`. friend class ConstantStruct; // For `Val`. + friend class ConstantAggregateZero; // For `Val`. /// All values point to the context. Context &Ctx; @@ -943,6 +945,48 @@ class ConstantVector final : public ConstantAggregate { } }; +// TODO: Inherit from ConstantData. +class ConstantAggregateZero final : public Constant { + ConstantAggregateZero(llvm::ConstantAggregateZero *C, Context &Ctx) + : Constant(ClassID::ConstantAggregateZero, C, Ctx) {} + friend class Context; // For constructor. + +public: + static ConstantAggregateZero *get(Type *Ty); + /// If this CAZ has array or vector type, return a zero with the right element + /// type. + Constant *getSequentialElement() const; + /// If this CAZ has struct type, return a zero with the right element type for + /// the specified element. + Constant *getStructElement(unsigned Elt) const; + /// Return a zero of the right value for the specified GEP index if we can, + /// otherwise return null (e.g. if C is a ConstantExpr). + Constant *getElementValue(Constant *C) const; + /// Return a zero of the right value for the specified GEP index. + Constant *getElementValue(unsigned Idx) const; + /// Return the number of elements in the array, vector, or struct. + ElementCount getElementCount() const { + return cast(Val)->getElementCount(); + } + + /// For isa/dyn_cast. 
+ static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::ConstantAggregateZero; + } + unsigned getUseOperandNo(const Use &Use) const final { + llvm_unreachable("ConstantAggregateZero has no operands!"); + } +#ifndef NDEBUG + void verify() const override { + assert(isa(Val) && "Expected a CAZ!"); + } + void dumpOS(raw_ostream &OS) const override { + dumpCommonPrefix(OS); + dumpCommonSuffix(OS); + } +#endif +}; + /// Iterator for `Instruction`s in a `BasicBlock. /// \Returns an sandboxir::Instruction & when derereferenced. class BBIterator { diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index f320f61934efa..b2180ba58afcc 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -30,6 +30,7 @@ DEF_CONST(ConstantFP, ConstantFP) DEF_CONST(ConstantArray, ConstantArray) DEF_CONST(ConstantStruct, ConstantStruct) DEF_CONST(ConstantVector, ConstantVector) +DEF_CONST(ConstantAggregateZero, ConstantAggregateZero) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 61721ca836321..69ca156e82101 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -293,6 +293,7 @@ class PointerType : public Type { class ArrayType : public Type { public: + static ArrayType *get(Type *ElementType, uint64_t NumElements); // TODO: add missing functions static bool classof(const Type *From) { return isa(From->LLVMTy); diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index c0e5837209213..89bbdf575c245 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2402,6 +2402,30 @@ StructType *ConstantStruct::getTypeForElements(Context &Ctx, return StructType::get(Ctx, EltTypes, Packed); } +ConstantAggregateZero *ConstantAggregateZero::get(Type *Ty) { + auto *LLVMC = llvm::ConstantAggregateZero::get(Ty->LLVMTy); + return cast( + Ty->getContext().getOrCreateConstant(LLVMC)); +} + +Constant *ConstantAggregateZero::getSequentialElement() const { + return cast(Ctx.getValue( + cast(Val)->getSequentialElement())); +} +Constant *ConstantAggregateZero::getStructElement(unsigned Elt) const { + return cast(Ctx.getValue( + cast(Val)->getStructElement(Elt))); +} +Constant *ConstantAggregateZero::getElementValue(Constant *C) const { + return cast( + Ctx.getValue(cast(Val)->getElementValue( + cast(C->Val)))); +} +Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const { + return cast(Ctx.getValue( + cast(Val)->getElementValue(Idx))); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2489,26 +2513,48 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { return It->second.get(); if (auto *C = dyn_cast(LLVMV)) { - if (auto *CI = dyn_cast(C)) { - It->second = std::unique_ptr(new ConstantInt(CI, *this)); + switch (C->getValueID()) { + case llvm::Value::ConstantIntVal: + It->second = std::unique_ptr( + new ConstantInt(cast(C), *this)); return It->second.get(); - } - if (auto *CF = dyn_cast(C)) { - It->second = std::unique_ptr(new ConstantFP(CF, *this)); + case llvm::Value::ConstantFPVal: + It->second = std::unique_ptr( + new ConstantFP(cast(C), *this)); return It->second.get(); + case llvm::Value::ConstantAggregateZeroVal: { + auto *CAZ = cast(C); + It->second = std::unique_ptr( + new 
ConstantAggregateZero(CAZ, *this)); + auto *Ret = It->second.get(); + // Must create sandboxir for elements. + auto EC = CAZ->getElementCount(); + if (EC.isFixed()) { + for (auto ElmIdx : seq(0, EC.getFixedValue())) + getOrCreateValueInternal(CAZ->getElementValue(ElmIdx), CAZ); + } + return Ret; } - if (auto *CA = dyn_cast(C)) - It->second = std::unique_ptr(new ConstantArray(CA, *this)); - else if (auto *CS = dyn_cast(C)) - It->second = - std::unique_ptr(new ConstantStruct(CS, *this)); - else if (auto *CV = dyn_cast(C)) - It->second = - std::unique_ptr(new ConstantVector(CV, *this)); - else if (auto *F = dyn_cast(LLVMV)) - It->second = std::unique_ptr(new Function(F, *this)); - else + case llvm::Value::ConstantArrayVal: + It->second = std::unique_ptr( + new ConstantArray(cast(C), *this)); + break; + case llvm::Value::ConstantStructVal: + It->second = std::unique_ptr( + new ConstantStruct(cast(C), *this)); + break; + case llvm::Value::ConstantVectorVal: + It->second = std::unique_ptr( + new ConstantVector(cast(C), *this)); + break; + case llvm::Value::FunctionVal: + It->second = std::unique_ptr( + new Function(cast(C), *this)); + break; + default: It->second = std::unique_ptr(new Constant(C, *this)); + break; + } auto *NewC = It->second.get(); for (llvm::Value *COp : C->operands()) getOrCreateValueInternal(COp, C); diff --git a/llvm/lib/SandboxIR/Type.cpp b/llvm/lib/SandboxIR/Type.cpp index 535b0f75fd874..11a16e865213f 100644 --- a/llvm/lib/SandboxIR/Type.cpp +++ b/llvm/lib/SandboxIR/Type.cpp @@ -47,6 +47,11 @@ PointerType *PointerType::get(Context &Ctx, unsigned AddressSpace) { Ctx.getType(llvm::PointerType::get(Ctx.LLVMCtx, AddressSpace))); } +ArrayType *ArrayType::get(Type *ElementType, uint64_t NumElements) { + return cast(ElementType->getContext().getType( + llvm::ArrayType::get(ElementType->LLVMTy, NumElements))); +} + StructType *StructType::get(Context &Ctx, ArrayRef Elements, bool IsPacked) { SmallVector LLVMElements; diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index d1c5690ccad5b..ebb127915ba85 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -520,6 +520,75 @@ define void @foo() { EXPECT_EQ(StructTy2Packed, StructTyPacked); } +TEST_F(SandboxIRTest, ConstantAggregateZero) { + parseIR(C, R"IR( +define void @foo(ptr %ptr, {i32, i8} %v1, <2 x i8> %v2) { + %extr0 = extractvalue [2 x i8] zeroinitializer, 0 + %extr1 = extractvalue {i32, i8} zeroinitializer, 0 + %extr2 = extractelement <2 x i8> zeroinitializer, i32 0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto &BB = *F.begin(); + auto It = BB.begin(); + auto *Extr0 = &*It++; + auto *Extr1 = &*It++; + auto *Extr2 = &*It++; + [[maybe_unused]] auto *Ret = cast(&*It++); + auto *Zero32 = + sandboxir::ConstantInt::get(sandboxir::Type::getInt32Ty(Ctx), 0); + auto *Zero8 = sandboxir::ConstantInt::get(sandboxir::Type::getInt8Ty(Ctx), 0); + auto *Int8Ty = sandboxir::Type::getInt8Ty(Ctx); + auto *Int32Ty = sandboxir::Type::getInt32Ty(Ctx); + auto *ArrayTy = sandboxir::ArrayType::get(Int8Ty, 2u); + auto *StructTy = sandboxir::StructType::get(Ctx, {Int32Ty, Int8Ty}); + auto *VectorTy = + sandboxir::VectorType::get(Int8Ty, ElementCount::getFixed(2u)); + + // Check creation and classof(). 
+ auto *ArrayCAZ = cast(Extr0->getOperand(0)); + EXPECT_EQ(ArrayCAZ->getType(), ArrayTy); + auto *StructCAZ = + cast(Extr1->getOperand(0)); + EXPECT_EQ(StructCAZ->getType(), StructTy); + auto *VectorCAZ = + cast(Extr2->getOperand(0)); + EXPECT_EQ(VectorCAZ->getType(), VectorTy); + // Check get(). + auto *SameVectorCAZ = + sandboxir::ConstantAggregateZero::get(sandboxir::VectorType::get( + sandboxir::Type::getInt8Ty(Ctx), ElementCount::getFixed(2))); + EXPECT_EQ(SameVectorCAZ, VectorCAZ); // Should be uniqued. + auto *NewVectorCAZ = + sandboxir::ConstantAggregateZero::get(sandboxir::VectorType::get( + sandboxir::Type::getInt8Ty(Ctx), ElementCount::getFixed(4))); + EXPECT_NE(NewVectorCAZ, VectorCAZ); + // Check getSequentialElement(). + auto *SeqElm = VectorCAZ->getSequentialElement(); + EXPECT_EQ(SeqElm, + sandboxir::ConstantInt::get(sandboxir::Type::getInt8Ty(Ctx), 0)); + // Check getStructElement(). + auto *StructElm0 = StructCAZ->getStructElement(0); + auto *StructElm1 = StructCAZ->getStructElement(1); + EXPECT_EQ(StructElm0, Zero32); + EXPECT_EQ(StructElm1, Zero8); + // Check getElementValue(Constant). + EXPECT_EQ(ArrayCAZ->getElementValue(Zero32), Zero8); + EXPECT_EQ(StructCAZ->getElementValue(Zero32), Zero32); + EXPECT_EQ(VectorCAZ->getElementValue(Zero32), Zero8); + // Check getElementValue(unsigned). + EXPECT_EQ(ArrayCAZ->getElementValue(0u), Zero8); + EXPECT_EQ(StructCAZ->getElementValue(0u), Zero32); + EXPECT_EQ(VectorCAZ->getElementValue(0u), Zero8); + // Check getElementCount(). + EXPECT_EQ(ArrayCAZ->getElementCount(), ElementCount::getFixed(2)); + EXPECT_EQ(NewVectorCAZ->getElementCount(), ElementCount::getFixed(4)); +} + TEST_F(SandboxIRTest, Use) { parseIR(C, R"IR( define i32 @foo(i32 %v0, i32 %v1) { diff --git a/llvm/unittests/SandboxIR/TypesTest.cpp b/llvm/unittests/SandboxIR/TypesTest.cpp index d4c2de441268c..36ef0cf8e5291 100644 --- a/llvm/unittests/SandboxIR/TypesTest.cpp +++ b/llvm/unittests/SandboxIR/TypesTest.cpp @@ -236,6 +236,10 @@ define void @foo([2 x i8] %v0) { // Check classof(), creation. [[maybe_unused]] auto *ArrayTy = cast(F->getArg(0)->getType()); + // Check get(). + auto *NewArrayTy = + sandboxir::ArrayType::get(sandboxir::Type::getInt8Ty(Ctx), 2u); + EXPECT_EQ(NewArrayTy, ArrayTy); } TEST_F(SandboxTypeTest, StructType) { From 9171881d64e4834de7ad7c9807607ce6bc5167a9 Mon Sep 17 00:00:00 2001 From: Scott Linder Date: Wed, 4 Sep 2024 22:19:11 +0000 Subject: [PATCH 163/425] [AMDGPU][Docs] DWARF aspace-aware base types (post-review fixes) --- .../AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst | 2 +- llvm/docs/AMDGPUUsage.rst | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst index b71e80f922250..0249c580964a0 100644 --- a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst +++ b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst @@ -3807,7 +3807,7 @@ A.5.1 Base Type Entries 2. A ``DW_TAG_base_type`` debugger information entry with the encoding ``DW_ATE_address`` may have a ``DW_AT_LLVM_address_space`` attribute whose value is an architecture specific address space (see - :ref:`amdgpu-dwarf-address-spaces`). If ommitted it defaults to + :ref:`amdgpu-dwarf-address-spaces`). If omitted it defaults to ``DW_ASPACE_LLVM_none``. .. 
_amdgpu-dwarf-type-modifier-entries: diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index a5ad3f6bbf7b2..ba62a68c4a509 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -2938,13 +2938,15 @@ error. or from that address space is always an evaluation error. For targets which support the generic address space, converting from - ``DW_ASPACE_AMDGPU_generic`` to ``DW_ASPACE_LLVM_none`` is always defined and - requires no change to the literal value of the address. + ``DW_ASPACE_AMDGPU_generic`` to ``DW_ASPACE_LLVM_none`` is defined when the + generic address is in the global address space. The conversion requires no + change to the literal value of the address. Converting from ``DW_ASPACE_AMDGPU_generic`` to any of ``DW_ASPACE_AMDGPU_local``, ``DW_ASPACE_AMDGPU_private_wave`` or ``DW_ASPACE_AMDGPU_private_lane`` is defined when the relevant hardware - support is present and setup has been completed. Conversion to + support is present, any required hardware setup has been completed, and the + generic address is in the corresponding address space. Conversion to ``DW_ASPACE_AMDGPU_private_lane`` additionally requires the context to include the active lane. From 7c4eb60c9509c3a750961eac2dbcaad369d911f2 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 4 Sep 2024 18:43:54 -0400 Subject: [PATCH 164/425] [Clang] Fix CLANG_TOOLCHAIN_PROGRAM_TIMEOUT logic PR #102521, which landed as 1ea0865dd6fa, implemented `CLANG_TOOLCHAIN_PROGRAM_TIMEOUT`, but the logic is obviously wrong. If the user-specified value is negative, it should become zero to mean infinite. Otherwise, it should be left as is. Thus, use `std::max` not `std::min`. This obvious fixup doesn't seem worth another pull request. --- clang/lib/Driver/ToolChain.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 64f23d43e87ee..16f9b629fc538 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -126,7 +126,7 @@ ToolChain::executeToolChainProgram(StringRef Executable) const { "CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected " "an integer, got '" + *Str + "'"); - SecondsToWait = std::min(SecondsToWait, 0); // infinite + SecondsToWait = std::max(SecondsToWait, 0); // infinite } if (llvm::sys::ExecuteAndWait(Executable, {}, {}, Redirects, SecondsToWait, /*MemoryLimit=*/0, &ErrorMessage)) From 950bb68516eb564c29815997450bdb6516ffdcec Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 4 Sep 2024 15:55:08 -0700 Subject: [PATCH 165/425] [SandboxIR] Implement ConstantPointerNull (#107320) This patch implements sandboxir::ConstantPointerNull mirroring llvm::ConstantPointerNull. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 31 +++++++++++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 15 +++++++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 27 ++++++++++++++++ 4 files changed, 74 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 89bc9c46581fc..4db4fae24b4b3 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -121,6 +121,7 @@ class BasicBlock; class ConstantInt; class ConstantFP; class ConstantAggregateZero; +class ConstantPointerNull; class Context; class Function; class Instruction; @@ -318,6 +319,7 @@ class Value { friend class ConstantArray; // For `Val`. friend class ConstantStruct; // For `Val`. friend class ConstantAggregateZero; // For `Val`. 
+ friend class ConstantPointerNull; // For `Val`. /// All values point to the context. Context &Ctx; @@ -987,6 +989,35 @@ class ConstantAggregateZero final : public Constant { #endif }; +// TODO: Inherit from ConstantData. +class ConstantPointerNull final : public Constant { + ConstantPointerNull(llvm::ConstantPointerNull *C, Context &Ctx) + : Constant(ClassID::ConstantPointerNull, C, Ctx) {} + friend class Context; // For constructor. + +public: + static ConstantPointerNull *get(PointerType *Ty); + + PointerType *getType() const; + + /// For isa/dyn_cast. + static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::ConstantPointerNull; + } + unsigned getUseOperandNo(const Use &Use) const final { + llvm_unreachable("ConstantPointerNull has no operands!"); + } +#ifndef NDEBUG + void verify() const override { + assert(isa(Val) && "Expected a CPNull!"); + } + void dumpOS(raw_ostream &OS) const override { + dumpCommonPrefix(OS); + dumpCommonSuffix(OS); + } +#endif +}; + /// Iterator for `Instruction`s in a `BasicBlock. /// \Returns an sandboxir::Instruction & when derereferenced. class BBIterator { diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index b2180ba58afcc..fce5aacc8c86d 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -31,6 +31,7 @@ DEF_CONST(ConstantArray, ConstantArray) DEF_CONST(ConstantStruct, ConstantStruct) DEF_CONST(ConstantVector, ConstantVector) DEF_CONST(ConstantAggregateZero, ConstantAggregateZero) +DEF_CONST(ConstantPointerNull, ConstantPointerNull) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 89bbdf575c245..acbc5c17ab256 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2426,6 +2426,17 @@ Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const { cast(Val)->getElementValue(Idx))); } +ConstantPointerNull *ConstantPointerNull::get(PointerType *Ty) { + auto *LLVMC = + llvm::ConstantPointerNull::get(cast(Ty->LLVMTy)); + return cast(Ty->getContext().getOrCreateConstant(LLVMC)); +} + +PointerType *ConstantPointerNull::getType() const { + return cast( + Ctx.getType(cast(Val)->getType())); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2535,6 +2546,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { } return Ret; } + case llvm::Value::ConstantPointerNullVal: + It->second = std::unique_ptr( + new ConstantPointerNull(cast(C), *this)); + return It->second.get(); case llvm::Value::ConstantArrayVal: It->second = std::unique_ptr( new ConstantArray(cast(C), *this)); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index ebb127915ba85..2f5ef92578e77 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -589,6 +589,33 @@ define void @foo(ptr %ptr, {i32, i8} %v1, <2 x i8> %v2) { EXPECT_EQ(NewVectorCAZ->getElementCount(), ElementCount::getFixed(4)); } +TEST_F(SandboxIRTest, ConstantPointerNull) { + parseIR(C, R"IR( +define ptr @foo() { + ret ptr null +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto &BB = *F.begin(); + auto It = BB.begin(); + auto *Ret = cast(&*It++); + // Check classof() and 
creation. + auto *CPNull = cast(Ret->getReturnValue()); + // Check get(). + auto *NewCPNull = + sandboxir::ConstantPointerNull::get(sandboxir::PointerType::get(Ctx, 0u)); + EXPECT_EQ(NewCPNull, CPNull); + auto *NewCPNull2 = + sandboxir::ConstantPointerNull::get(sandboxir::PointerType::get(Ctx, 1u)); + EXPECT_NE(NewCPNull2, CPNull); + // Check getType(). + EXPECT_EQ(CPNull->getType(), sandboxir::PointerType::get(Ctx, 0u)); + EXPECT_EQ(NewCPNull2->getType(), sandboxir::PointerType::get(Ctx, 1u)); +} + TEST_F(SandboxIRTest, Use) { parseIR(C, R"IR( define i32 @foo(i32 %v0, i32 %v1) { From 9efe377307694be0c92f7cb3b02fd1d090fdbeb8 Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Wed, 4 Sep 2024 16:03:13 -0700 Subject: [PATCH 166/425] [HLSL] Implement '__builtin_hlsl_is_intangible' type trait (#104544) Implements `__builtin_hlsl_is_intangible` type trait. HLSL intangible types are special implementation-defined types such as resource handles or samplers. Any class that is an array of intangible type or contains base class or members of intangible types is also an intangible type. Fixes #[102954](https://github.com/llvm/llvm-project/issues/102954) --- .../clang/AST/CXXRecordDeclDefinitionBits.def | 4 + clang/include/clang/AST/DeclCXX.h | 4 + clang/include/clang/AST/Type.h | 7 ++ clang/include/clang/Basic/TokenKinds.def | 3 +- clang/include/clang/Sema/SemaHLSL.h | 1 + clang/lib/AST/DeclCXX.cpp | 17 +++- clang/lib/Sema/SemaExprCXX.cpp | 11 +++ clang/lib/Sema/SemaHLSL.cpp | 28 +++++++ .../Types/Traits/IsIntangibleType.hlsl | 78 +++++++++++++++++++ .../Types/Traits/IsIntangibleTypeErrors.hlsl | 11 +++ 10 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl create mode 100644 clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl diff --git a/clang/include/clang/AST/CXXRecordDeclDefinitionBits.def b/clang/include/clang/AST/CXXRecordDeclDefinitionBits.def index cdf0804680ad0..6620840df0ced 100644 --- a/clang/include/clang/AST/CXXRecordDeclDefinitionBits.def +++ b/clang/include/clang/AST/CXXRecordDeclDefinitionBits.def @@ -249,4 +249,8 @@ FIELD(HasDeclaredCopyAssignmentWithConstParam, 1, MERGE_OR) /// base classes or fields have a no-return destructor FIELD(IsAnyDestructorNoReturn, 1, NO_MERGE) +/// Whether the record type is intangible (if any base classes or fields have +/// type that is intangible). HLSL only. +FIELD(IsHLSLIntangible, 1, NO_MERGE) + #undef FIELD diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 0d72cc6a08dcb..252e6e9256414 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -1547,6 +1547,10 @@ class CXXRecordDecl : public RecordDecl { /// destructors are marked noreturn. bool isAnyDestructorNoReturn() const { return data().IsAnyDestructorNoReturn; } + /// Returns true if the class contains HLSL intangible type, either as + /// a field or in base class. + bool isHLSLIntangible() const { return data().IsHLSLIntangible; } + /// If the class is a local class [class.local], returns /// the enclosing function declaration. 
const FunctionDecl *isLocalClass() const { diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 853226118af40..ef36a73716454 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2658,6 +2658,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) bool is##Id##Type() const; #include "clang/Basic/HLSLIntangibleTypes.def" bool isHLSLSpecificType() const; // Any HLSL specific type + bool isHLSLIntangibleType() const; // Any HLSL intangible type /// Determines if this type, which must satisfy /// isObjCLifetimeType(), is implicitly __unsafe_unretained rather @@ -8341,6 +8342,12 @@ inline bool Type::isHLSLSpecificType() const { false; // end boolean or operation } +inline bool Type::isHLSLIntangibleType() const { + // All HLSL specific types are currently intangible type as well, but that + // might change in the future. + return isHLSLSpecificType(); +} + inline bool Type::isTemplateTypeParmType() const { return isa(CanonicalType); } diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 212c1f6ff3a12..a82ff684b2ac7 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -660,8 +660,9 @@ KEYWORD(out , KEYHLSL) #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) KEYWORD(Name, KEYHLSL) #include "clang/Basic/HLSLIntangibleTypes.def" -// HLSL Type traits. +// HLSL Type traits TYPE_TRAIT_2(__builtin_hlsl_is_scalarized_layout_compatible, IsScalarizedLayoutCompatible, KEYHLSL) +TYPE_TRAIT_1(__builtin_hlsl_is_intangible, IsIntangibleType, KEYHLSL) // OpenMP Type Traits UNARY_EXPR_OR_TYPE_TRAIT(__builtin_omp_required_simd_align, OpenMPRequiredSimdAlign, KEYALL) diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index d79ca9a4fa18d..285e4e5f3c765 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -71,6 +71,7 @@ class SemaHLSL : public SemaBase { // HLSL Type trait implementations bool IsScalarizedLayoutCompatible(QualType T1, QualType T2) const; + bool IsIntangibleType(QualType T1); bool CheckCompatibleParameterABI(FunctionDecl *New, FunctionDecl *Old); diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 9a3ede426e914..01143391edab4 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -109,7 +109,7 @@ CXXRecordDecl::DefinitionData::DefinitionData(CXXRecordDecl *D) ImplicitCopyAssignmentHasConstParam(true), HasDeclaredCopyConstructorWithConstParam(false), HasDeclaredCopyAssignmentWithConstParam(false), - IsAnyDestructorNoReturn(false), IsLambda(false), + IsAnyDestructorNoReturn(false), IsHLSLIntangible(false), IsLambda(false), IsParsingBaseSpecifiers(false), ComputedVisibleConversions(false), HasODRHash(false), Definition(D) {} @@ -431,6 +431,9 @@ CXXRecordDecl::setBases(CXXBaseSpecifier const * const *Bases, if (BaseClassDecl->isAnyDestructorNoReturn()) data().IsAnyDestructorNoReturn = true; + if (BaseClassDecl->isHLSLIntangible()) + data().IsHLSLIntangible = true; + // C++11 [class.copy]p18: // The implicitly-declared copy assignment operator for a class X will // have the form 'X& X::operator=(const X&)' if each direct base class B @@ -1401,6 +1404,18 @@ void CXXRecordDecl::addedMember(Decl *D) { // than subobjects of zero size if (data().Empty && !IsZeroSize) data().Empty = false; + + if (getLangOpts().HLSL) { + const Type *Ty = 
Field->getType()->getUnqualifiedDesugaredType(); + while (isa(Ty)) + Ty = Ty->getArrayElementTypeNoTypeQual(); + + Ty = Ty->getUnqualifiedDesugaredType(); + if (Ty->isBuiltinType()) + data().IsHLSLIntangible |= Ty->isHLSLIntangibleType(); + else if (const RecordType *RT = dyn_cast(Ty)) + data().IsHLSLIntangible |= RT->getAsCXXRecordDecl()->isHLSLIntangible(); + } } // Handle using declarations of conversion functions. diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index b7531581d37ff..14feafd1e6b17 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -5111,6 +5111,7 @@ static bool CheckUnaryTypeTraitTypeCompleteness(Sema &S, TypeTrait UTT, case UTT_IsDestructible: case UTT_IsNothrowDestructible: case UTT_IsTriviallyDestructible: + case UTT_IsIntangibleType: if (ArgTy->isIncompleteArrayType() || ArgTy->isVoidType()) return true; @@ -5696,6 +5697,16 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT, return true; return false; } + case UTT_IsIntangibleType: + assert(Self.getLangOpts().HLSL && "intangible types are HLSL-only feature"); + if (!T->isVoidType() && !T->isIncompleteArrayType()) + if (Self.RequireCompleteType(TInfo->getTypeLoc().getBeginLoc(), T, + diag::err_incomplete_type)) + return false; + if (DiagnoseVLAInCXXTypeTrait(Self, TInfo, + tok::kw___builtin_hlsl_is_intangible)) + return false; + return Self.HLSL().IsIntangibleType(T); } } diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 778d524a00548..65aeda4b7b613 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -10,8 +10,11 @@ #include "clang/Sema/SemaHLSL.h" #include "clang/AST/Decl.h" +#include "clang/AST/DeclBase.h" +#include "clang/AST/DeclCXX.h" #include "clang/AST/Expr.h" #include "clang/AST/RecursiveASTVisitor.h" +#include "clang/AST/Type.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -1609,6 +1612,31 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return false; } +bool SemaHLSL::IsIntangibleType(clang::QualType QT) { + if (QT.isNull()) + return false; + + const Type *Ty = QT->getUnqualifiedDesugaredType(); + + // check if it's a builtin type first (simple check, no need to cache it) + if (Ty->isBuiltinType()) + return Ty->isHLSLIntangibleType(); + + // unwrap arrays + while (isa(Ty)) + Ty = Ty->getArrayElementTypeNoTypeQual(); + + const RecordType *RT = + dyn_cast(Ty->getUnqualifiedDesugaredType()); + if (!RT) + return false; + + CXXRecordDecl *RD = RT->getAsCXXRecordDecl(); + assert(RD != nullptr && + "all HLSL struct and classes should be CXXRecordDecl"); + return RD->isHLSLIntangible(); +} + static void BuildFlattenedTypeList(QualType BaseTy, llvm::SmallVectorImpl &List) { llvm::SmallVector WorkList; diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl new file mode 100644 index 0000000000000..92cba1dcd4bdf --- /dev/null +++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl @@ -0,0 +1,78 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -verify %s +// expected-no-diagnostics + +_Static_assert(__builtin_hlsl_is_intangible(__hlsl_resource_t), ""); +// no need to check array of __hlsl_resource_t, arrays of sizeless types are not supported + 
+_Static_assert(!__builtin_hlsl_is_intangible(int), ""); +_Static_assert(!__builtin_hlsl_is_intangible(float3), ""); +_Static_assert(!__builtin_hlsl_is_intangible(half[4]), ""); + +typedef __hlsl_resource_t Res; +_Static_assert(__builtin_hlsl_is_intangible(const Res), ""); +// no need to check array of Res, arrays of sizeless types are not supported + +struct ABuffer { + const int i[10]; + __hlsl_resource_t h; +}; +_Static_assert(__builtin_hlsl_is_intangible(ABuffer), ""); +_Static_assert(__builtin_hlsl_is_intangible(ABuffer[10]), ""); + +struct MyStruct { + half2 h2; + int3 i3; +}; +_Static_assert(!__builtin_hlsl_is_intangible(MyStruct), ""); +_Static_assert(!__builtin_hlsl_is_intangible(MyStruct[10]), ""); + +class MyClass { + int3 ivec; + float farray[12]; + MyStruct ms; + ABuffer buf; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyClass), ""); +_Static_assert(__builtin_hlsl_is_intangible(MyClass[2]), ""); + +union U { + double d[4]; + Res buf; +}; +_Static_assert(__builtin_hlsl_is_intangible(U), ""); +_Static_assert(__builtin_hlsl_is_intangible(U[100]), ""); + +class MyClass2 { + int3 ivec; + float farray[12]; + U u; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyClass2), ""); +_Static_assert(__builtin_hlsl_is_intangible(MyClass2[5]), ""); + +class Simple { + int a; +}; + +template struct TemplatedBuffer { + T a; + __hlsl_resource_t h; +}; +_Static_assert(__builtin_hlsl_is_intangible(TemplatedBuffer), ""); + +struct MyStruct2 : TemplatedBuffer { + float x; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyStruct2), ""); + +struct MyStruct3 { + const TemplatedBuffer TB[10]; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyStruct3), ""); + +template struct SimpleTemplate { + T a; +}; +_Static_assert(__builtin_hlsl_is_intangible(SimpleTemplate<__hlsl_resource_t>), ""); +_Static_assert(!__builtin_hlsl_is_intangible(SimpleTemplate), ""); diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl new file mode 100644 index 0000000000000..0803086749bd7 --- /dev/null +++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s + +struct Undefined; // expected-note {{forward declaration of 'Undefined'}} +_Static_assert(!__builtin_hlsl_is_intangible(Undefined), ""); // expected-error{{incomplete type 'Undefined' used in type trait expression}} + +void fn(int X) { + // expected-error@#vla {{variable length arrays are not supported for the current target}} + // expected-error@#vla {{variable length arrays are not supported in '__builtin_hlsl_is_intangible'}} + // expected-warning@#vla {{variable length arrays in C++ are a Clang extension}} + _Static_assert(!__builtin_hlsl_is_intangible(int[X]), ""); // #vla +} From aecbc924102ee57ea639cd76ed32b37eb2d257fc Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 4 Sep 2024 16:14:13 -0700 Subject: [PATCH 167/425] [WebAssembly] Rename CATCH/CATCH_ALL to *_LEGACY (#107187) This renames MIR instruction `CATCH` and `CATCH_ALL` to `CATCH_LEGACY` and `CATCH_ALL_LEGACY` respectively. Follow-up PRs for the new EH (exnref) implementation will use `CATCH`, `CATCH_REF`, `CATCH_ALL`, and `CATCH_ALL_REF` as pseudo-instructions that return extracted values or `exnref` or both, because we don't currently support block return values in LLVM. 
So to give the old (real) `CATCH`es and the new (pseudo) `CATCH`es different names, this attaches `_LEGACY` prefix to the old names. This also rearranges `WebAssemblyInstrControl.td` so that the old legacy instructions are listed all together at the end. --- .../MCTargetDesc/WebAssemblyInstPrinter.cpp | 17 +++++----- .../MCTargetDesc/WebAssemblyInstPrinter.h | 2 +- .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 8 ++--- .../WebAssembly/WebAssemblyCFGStackify.cpp | 10 +++--- .../WebAssembly/WebAssemblyISelDAGToDAG.cpp | 2 +- .../WebAssembly/WebAssemblyInstrControl.td | 34 ++++++++++++------- .../WebAssembly/WebAssemblyLateEHPrepare.cpp | 2 +- .../WebAssembly/cfg-stackify-eh-legacy.mir | 23 +++++++------ .../CodeGen/WebAssembly/exception-legacy.mir | 4 +-- .../CodeGen/WebAssembly/function-info.mir | 4 +-- .../WebAssemblyExceptionInfoTest.cpp | 16 ++++----- 11 files changed, 66 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index bf6d6dce1f8ac..b85ed1d93593b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -166,15 +166,15 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address, } return; - case WebAssembly::CATCH: - case WebAssembly::CATCH_S: - case WebAssembly::CATCH_ALL: - case WebAssembly::CATCH_ALL_S: + case WebAssembly::CATCH_LEGACY: + case WebAssembly::CATCH_LEGACY_S: + case WebAssembly::CATCH_ALL_LEGACY: + case WebAssembly::CATCH_ALL_LEGACY_S: // There can be multiple catch instructions for one try instruction, so // we print a label only for the first 'catch' label. if (EHInstStack.empty()) { printAnnotation(OS, "try-catch mismatch!"); - } else if (EHInstStack.back() == CATCH_ALL) { + } else if (EHInstStack.back() == CATCH_ALL_LEGACY) { printAnnotation(OS, "catch/catch_all cannot occur after catch_all"); } else if (EHInstStack.back() == TRY) { if (TryStack.empty()) { @@ -183,10 +183,11 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address, printAnnotation(OS, "catch" + utostr(TryStack.pop_back_val()) + ':'); } EHInstStack.pop_back(); - if (Opc == WebAssembly::CATCH || Opc == WebAssembly::CATCH_S) { - EHInstStack.push_back(CATCH); + if (Opc == WebAssembly::CATCH_LEGACY || + Opc == WebAssembly::CATCH_LEGACY_S) { + EHInstStack.push_back(CATCH_LEGACY); } else { - EHInstStack.push_back(CATCH_ALL); + EHInstStack.push_back(CATCH_ALL_LEGACY); } } return; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h index f3c0124fd7f1f..8fd54d1640905 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h @@ -28,7 +28,7 @@ class WebAssemblyInstPrinter final : public MCInstPrinter { SmallVector, 4> ControlFlowStack; SmallVector TryStack; - enum EHInstKind { TRY, CATCH, CATCH_ALL }; + enum EHInstKind { TRY, CATCH_LEGACY, CATCH_ALL_LEGACY }; SmallVector EHInstStack; public: diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index eb3087dafed2a..00f15e1db5e13 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -470,10 +470,10 @@ inline bool 
isMarker(unsigned Opc) { inline bool isCatch(unsigned Opc) { switch (Opc) { - case WebAssembly::CATCH: - case WebAssembly::CATCH_S: - case WebAssembly::CATCH_ALL: - case WebAssembly::CATCH_ALL_S: + case WebAssembly::CATCH_LEGACY: + case WebAssembly::CATCH_LEGACY_S: + case WebAssembly::CATCH_ALL_LEGACY: + case WebAssembly::CATCH_ALL_LEGACY_S: return true; default: return false; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 6fd882f62f3f0..3362ea5316e45 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -1305,7 +1305,7 @@ bool WebAssemblyCFGStackify::fixCatchUnwindMismatches(MachineFunction &MF) { // catch_all always catches an exception, so we don't need to do // anything - if (MI.getOpcode() == WebAssembly::CATCH_ALL) { + if (MI.getOpcode() == WebAssembly::CATCH_ALL_LEGACY) { } // This can happen when the unwind dest was removed during the @@ -1448,8 +1448,8 @@ void WebAssemblyCFGStackify::recalculateScopeTops(MachineFunction &MF) { case WebAssembly::DELEGATE: updateScopeTops(EndToBegin[&MI]->getParent(), &MBB); break; - case WebAssembly::CATCH: - case WebAssembly::CATCH_ALL: + case WebAssembly::CATCH_LEGACY: + case WebAssembly::CATCH_ALL_LEGACY: updateScopeTops(EHPadToTry[&MBB]->getParent(), &MBB); break; } @@ -1698,8 +1698,8 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { Stack.push_back(std::make_pair(EndToBegin[&MI]->getParent(), &MI)); break; - case WebAssembly::CATCH: - case WebAssembly::CATCH_ALL: + case WebAssembly::CATCH_LEGACY: + case WebAssembly::CATCH_ALL_LEGACY: EHPadStack.pop_back(); break; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index 0f06f54f219f9..60c5e18fbb0cd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -212,7 +212,7 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { int Tag = Node->getConstantOperandVal(2); SDValue SymNode = getTagSymNode(Tag, CurDAG); MachineSDNode *Catch = - CurDAG->getMachineNode(WebAssembly::CATCH, DL, + CurDAG->getMachineNode(WebAssembly::CATCH_LEGACY, DL, { PtrVT, // exception pointer MVT::Other // outchain type diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td index be6547007aaf7..dd40015577fd7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -127,11 +127,27 @@ defm DEBUG_UNREACHABLE : NRI<(outs), (ins), [(debugtrap)], "unreachable", 0x00>; let Predicates = [HasExceptionHandling] in { -// Throwing an exception: throw / rethrow +// Throwing an exception: throw let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { defm THROW : I<(outs), (ins tag_op:$tag, variable_ops), (outs), (ins tag_op:$tag), [], "throw \t$tag", "throw \t$tag", 0x08>; +} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 + +// Pseudo instructions: cleanupret / catchret +let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + isPseudo = 1, isEHScopeReturn = 1 in { + defm CLEANUPRET : NRI<(outs), (ins), [(cleanupret)], "cleanupret", 0>; + defm CATCHRET : NRI<(outs), (ins bb_op:$dst, bb_op:$from), + [(catchret bb:$dst, bb:$from)], "catchret", 0>; +} // isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + // isPseudo = 
1, isEHScopeReturn = 1 + +// Below are instructions from the legacy EH proposal. Could be deprecated if +// usage gets low enough. + +// Rethrowing an exception: rethrow +let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { defm RETHROW : NRI<(outs), (ins i32imm:$depth), [], "rethrow \t$depth", 0x09>; } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 // The depth argument will be computed in CFGStackify. We set it to 0 here for @@ -147,22 +163,14 @@ defm END_TRY : NRI<(outs), (ins), [], "end_try", 0x0b>; // Catching an exception: catch / catch_all let hasCtrlDep = 1, hasSideEffects = 1 in { let variadicOpsAreDefs = 1 in -defm CATCH : I<(outs), (ins tag_op:$tag, variable_ops), - (outs), (ins tag_op:$tag), [], - "catch", "catch \t$tag", 0x07>; -defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x19>; +defm CATCH_LEGACY : I<(outs), (ins tag_op:$tag, variable_ops), + (outs), (ins tag_op:$tag), [], + "catch", "catch \t$tag", 0x07>; +defm CATCH_ALL_LEGACY : NRI<(outs), (ins), [], "catch_all", 0x19>; } // Delegating an exception: delegate let isTerminator = 1, hasCtrlDep = 1, hasSideEffects = 1 in defm DELEGATE : NRI<(outs), (ins bb_op:$dst), [], "delegate \t $dst", 0x18>; -// Pseudo instructions: cleanupret / catchret -let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, - isPseudo = 1, isEHScopeReturn = 1 in { - defm CLEANUPRET : NRI<(outs), (ins), [(cleanupret)], "cleanupret", 0>; - defm CATCHRET : NRI<(outs), (ins bb_op:$dst, bb_op:$from), - [(catchret bb:$dst, bb:$from)], "catchret", 0>; -} // isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, - // isPseudo = 1, isEHScopeReturn = 1 } // Predicates = [HasExceptionHandling] diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp index 94037b9ab189d..f0c205cdb6aeb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp @@ -216,7 +216,7 @@ bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) { Changed = true; BuildMI(MBB, InsertPos, InsertPos == MBB.end() ? DebugLoc() : InsertPos->getDebugLoc(), - TII.get(WebAssembly::CATCH_ALL)); + TII.get(WebAssembly::CATCH_ALL_LEGACY)); } } return Changed; diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir index 0386410d1b612..c37a82fe80826 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir @@ -31,22 +31,23 @@ body: | bb.1 (landing-pad): successors: %bb.2 ; CHECK: bb.1 (landing-pad): - ; CHECK: CATCH + ; CHECK: CATCH_LEGACY ; CHECK: TRY - ; This RETHROW rethrows the exception caught by this BB's CATCH, but after - ; CFGStackify a TRY is placed between the CATCH and this RETHROW, so after - ; CFGStackify its immediate argument should become not 0, but 1. + ; This RETHROW rethrows the exception caught by this BB's CATCH_LEGACY, but + ; after CFGStackify a TRY is placed between the CATCH_LEGACY and this + ; RETHROW, so after CFGStackify its immediate argument should become not 0, + ; but 1. 
; CHECK: RETHROW 1 EH_LABEL - %0:i32 = CATCH &__cpp_exception, implicit-def dead $arguments + %0:i32 = CATCH_LEGACY &__cpp_exception, implicit-def dead $arguments RETHROW 0, implicit-def dead $arguments bb.2 (landing-pad): ; CHECK: bb.2 (landing-pad): - ; CHECK: CATCH + ; CHECK: CATCH_LEGACY ; CHECK: RETHROW 0 EH_LABEL - %1:i32 = CATCH &__cpp_exception, implicit-def dead $arguments + %1:i32 = CATCH_LEGACY &__cpp_exception, implicit-def dead $arguments RETHROW 0, implicit-def dead $arguments bb.3: @@ -78,13 +79,13 @@ body: | ; CHECK: CALL @foo ; CHECK: DELEGATE ; CHECK: RETURN - ; CHECK: CATCH + ; CHECK: CATCH_LEGACY ;; This TRY should have the return type i32 (127) ; CHECK: TRY 127 ; CHECK: RETHROW ; CHECK: DELEGATE ; CHECK: END_TRY - ; CHECK: CATCH + ; CHECK: CATCH_LEGACY ; CHECK: RETHROW ; CHECK: END_TRY bb.0: @@ -105,11 +106,11 @@ body: | bb.3 (landing-pad): EH_LABEL - %0:i32 = CATCH &__cpp_exception, implicit-def dead $arguments + %0:i32 = CATCH_LEGACY &__cpp_exception, implicit-def dead $arguments RETHROW 0, implicit-def dead $arguments bb.4 (landing-pad): EH_LABEL - %1:i32 = CATCH &__cpp_exception, implicit-def dead $arguments + %1:i32 = CATCH_LEGACY &__cpp_exception, implicit-def dead $arguments RETHROW 0, implicit-def dead $arguments ... diff --git a/llvm/test/CodeGen/WebAssembly/exception-legacy.mir b/llvm/test/CodeGen/WebAssembly/exception-legacy.mir index 895e8d8864ea2..f9eb40bb03cc7 100644 --- a/llvm/test/CodeGen/WebAssembly/exception-legacy.mir +++ b/llvm/test/CodeGen/WebAssembly/exception-legacy.mir @@ -42,13 +42,13 @@ body: | bb.1 (landing-pad): ; predecessors: %bb.0 successors: %bb.2 - ; CATCH_ALL should be after EH_LABELs in the beginning of an EH pad. + ; CATCH_ALL_LEGACY should be after EH_LABELs in the beginning of an EH pad. ; (Sometimes there are multiple EH_LABELs in an EH pad. This test tests ; that.) GLOBAL_SET should follow right after that. ; CHECK: bb.1 ; CHECK: EH_LABEL ; CHECK: EH_LABEL - ; CHECK-NEXT: CATCH_ALL + ; CHECK-NEXT: CATCH_ALL_LEGACY ; CHECK-NEXT: GLOBAL_SET_I32 EH_LABEL EH_LABEL diff --git a/llvm/test/CodeGen/WebAssembly/function-info.mir b/llvm/test/CodeGen/WebAssembly/function-info.mir index 2971d234c9b2d..d241d59b67a39 100644 --- a/llvm/test/CodeGen/WebAssembly/function-info.mir +++ b/llvm/test/CodeGen/WebAssembly/function-info.mir @@ -61,12 +61,12 @@ body: | bb.2 (landing-pad): successors: %bb.1, %bb.3 - %0:i32 = CATCH &__cpp_exception, implicit-def dead $arguments + %0:i32 = CATCH_LEGACY &__cpp_exception, implicit-def dead $arguments CALL @foo, implicit-def dead $arguments, implicit $sp32, implicit $sp64, implicit-def dead $arguments, implicit $sp32, implicit $sp64 BR %bb.1, implicit-def $arguments bb.3 (landing-pad): - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def $arguments ... 
diff --git a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp index 55caaf5d13b6c..073beb9446ffb 100644 --- a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp +++ b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp @@ -101,14 +101,14 @@ body: | ; predecessors: %bb.0 successors: %bb.3, %bb.9 liveins: $value_stack - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def dead $arguments bb.3 (landing-pad): ; predecessors: %bb.2 successors: %bb.4, %bb.6 liveins: $value_stack - %1:i32 = CATCH &__cpp_exception, implicit-def $arguments + %1:i32 = CATCH_LEGACY &__cpp_exception, implicit-def $arguments BR_IF %bb.4, %58:i32, implicit-def $arguments, implicit-def $value_stack, implicit $value_stack BR %bb.6, implicit-def $arguments @@ -139,13 +139,13 @@ body: | ; predecessors: %bb.4 successors: %bb.9 liveins: $value_stack - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def dead $arguments bb.9 (landing-pad): ; predecessors: %bb.2, %bb.6, %bb.8 liveins: $value_stack - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def dead $arguments bb.10: @@ -257,7 +257,7 @@ body: | ; predecessors: %bb.0 successors: %bb.2, %bb.8 liveins: $value_stack - %0:i32 = CATCH &__cpp_exception, implicit-def $arguments + %0:i32 = CATCH_LEGACY &__cpp_exception, implicit-def $arguments BR_IF %bb.2, %32:i32, implicit-def $arguments, implicit-def $value_stack, implicit $value_stack BR %bb.8, implicit-def $arguments @@ -271,7 +271,7 @@ body: | ; predecessors: %bb.2 successors: %bb.4, %bb.6 liveins: $value_stack - %1:i32 = CATCH &__cpp_exception, implicit-def $arguments + %1:i32 = CATCH_LEGACY &__cpp_exception, implicit-def $arguments BR_IF %bb.4, %43:i32, implicit-def $arguments, implicit-def $value_stack, implicit $value_stack BR %bb.6, implicit-def $arguments @@ -313,13 +313,13 @@ body: | ; predecessors: %bb.4 successors: %bb.11 liveins: $value_stack - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def dead $arguments bb.11 (landing-pad): ; predecessors: %bb.2, %bb.6, %bb.10 liveins: $value_stack - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def dead $arguments bb.12: From 23457964392d00fc872fa6021763859024fb38da Mon Sep 17 00:00:00 2001 From: ziqingluo-90 Date: Wed, 4 Sep 2024 16:26:04 -0700 Subject: [PATCH 168/425] Revert "[-Wunsafe-buffer-usage] Warning Libc functions (#101583)" This reverts commit 0fffdeb5f46078ddcc61e112cd38856b1165f050. 
Will re-land this commit soon with a way to opt-out --- .../Analysis/Analyses/UnsafeBufferUsage.h | 15 - .../Analyses/UnsafeBufferUsageGadgets.def | 1 - .../clang/Basic/DiagnosticSemaKinds.td | 7 - clang/lib/Analysis/UnsafeBufferUsage.cpp | 513 +----------------- clang/lib/Sema/AnalysisBasedWarnings.cpp | 14 - ...-usage-libc-functions-inline-namespace.cpp | 60 -- ...arn-unsafe-buffer-usage-libc-functions.cpp | 106 ---- ...n-unsafe-buffer-usage-test-unreachable.cpp | 4 +- 8 files changed, 4 insertions(+), 716 deletions(-) delete mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp delete mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h index aa2c01ad10d45..228b4ae1e3e11 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h @@ -15,7 +15,6 @@ #define LLVM_CLANG_ANALYSIS_ANALYSES_UNSAFEBUFFERUSAGE_H #include "clang/AST/Decl.h" -#include "clang/AST/Expr.h" #include "clang/AST/Stmt.h" #include "clang/Basic/SourceLocation.h" #include "llvm/Support/Debug.h" @@ -107,20 +106,6 @@ class UnsafeBufferUsageHandler { virtual void handleUnsafeOperation(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) = 0; - /// Invoked when a call to an unsafe libc function is found. - /// \param PrintfInfo - /// is 0 if the callee function is not a member of the printf family; - /// is 1 if the callee is `sprintf`; - /// is 2 if arguments of the call have `__size_by` relation but are not in a - /// safe pattern; - /// is 3 if string arguments do not guarantee null-termination - /// is 4 if the callee takes va_list - /// \param UnsafeArg one of the actual arguments that is unsafe, non-null - /// only when `2 <= PrintfInfo <= 3` - virtual void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, - ASTContext &Ctx, - const Expr *UnsafeArg = nullptr) = 0; - /// Invoked when an unsafe operation with a std container is found. 
virtual void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def index ac01b285ae833..242ad763ba62b 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def @@ -38,7 +38,6 @@ WARNING_GADGET(PointerArithmetic) WARNING_GADGET(UnsafeBufferUsageAttr) WARNING_GADGET(UnsafeBufferUsageCtorAttr) WARNING_GADGET(DataInvocation) -WARNING_GADGET(UnsafeLibcFunctionCall) WARNING_CONTAINER_GADGET(SpanTwoParamConstructor) // Uses of `std::span(arg0, arg1)` FIXABLE_GADGET(ULCArraySubscript) // `DRE[any]` in an Unspecified Lvalue Context FIXABLE_GADGET(DerefSimplePtrArithFixable) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 35f68f51dfb35..dcb49d8a67604 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12412,13 +12412,6 @@ def warn_unsafe_buffer_operation : Warning< "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data|" "field %1 prone to unsafe buffer manipulation}0">, InGroup, DefaultIgnore; -def warn_unsafe_buffer_libc_call : Warning< - "function %0 is unsafe">, - InGroup, DefaultIgnore; -def note_unsafe_buffer_printf_call : Note< - "%select{|change to 'snprintf' for explicit bounds checking | buffer pointer and size may not match" - "|string argument is not guaranteed to be null-terminated" - "|'va_list' is unsafe}0">; def note_unsafe_buffer_operation : Note< "used%select{| in pointer arithmetic| in buffer access}0 here">; def note_unsafe_buffer_variable_fixit_group : Note< diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index f0d072643f8ff..da7446913f7c8 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -10,12 +10,12 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" -#include "clang/AST/FormatString.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Stmt.h" #include "clang/AST/StmtVisitor.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" +#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/Lexer.h" #include "clang/Lex/Preprocessor.h" @@ -443,426 +443,6 @@ AST_MATCHER(ArraySubscriptExpr, isSafeArraySubscript) { return false; } -AST_MATCHER_P(CallExpr, hasNumArgs, unsigned, Num) { - return Node.getNumArgs() == Num; -} - -namespace libc_func_matchers { -// Under `libc_func_matchers`, define a set of matchers that match unsafe -// functions in libc and unsafe calls to them. - -// A tiny parser to strip off common prefix and suffix of libc function names -// in real code. 
-// -// Given a function name, `matchName` returns `CoreName` according to the -// following grammar: -// -// LibcName := CoreName | CoreName + "_s" -// MatchingName := "__builtin_" + LibcName | -// "__builtin___" + LibcName + "_chk" | -// "__asan_" + LibcName -// -struct LibcFunNamePrefixSuffixParser { - StringRef matchName(StringRef FunName, bool isBuiltin) { - // Try to match __builtin_: - if (isBuiltin && FunName.starts_with("__builtin_")) - // Then either it is __builtin_LibcName or __builtin___LibcName_chk or - // no match: - return matchLibcNameOrBuiltinChk( - FunName.drop_front(10 /* truncate "__builtin_" */)); - // Try to match __asan_: - if (FunName.starts_with("__asan_")) - return matchLibcName(FunName.drop_front(7 /* truncate of "__asan_" */)); - return matchLibcName(FunName); - } - - // Parameter `Name` is the substring after stripping off the prefix - // "__builtin_". - StringRef matchLibcNameOrBuiltinChk(StringRef Name) { - if (Name.starts_with("__") && Name.ends_with("_chk")) - return matchLibcName( - Name.drop_front(2).drop_back(4) /* truncate "__" and "_chk" */); - return matchLibcName(Name); - } - - StringRef matchLibcName(StringRef Name) { - if (Name.ends_with("_s")) - return Name.drop_back(2 /* truncate "_s" */); - return Name; - } -}; - -// A pointer type expression is known to be null-terminated, if it has the -// form: E.c_str(), for any expression E of `std::string` type. -static bool isNullTermPointer(const Expr *Ptr) { - if (isa(Ptr->IgnoreParenImpCasts())) - return true; - if (isa(Ptr->IgnoreParenImpCasts())) - return true; - if (auto *MCE = dyn_cast(Ptr->IgnoreParenImpCasts())) { - const CXXMethodDecl *MD = MCE->getMethodDecl(); - const CXXRecordDecl *RD = MCE->getRecordDecl()->getCanonicalDecl(); - - if (MD && RD && RD->isInStdNamespace()) - if (MD->getName() == "c_str" && RD->getName() == "basic_string") - return true; - } - return false; -} - -// Return true iff at least one of following cases holds: -// 1. Format string is a literal and there is an unsafe pointer argument -// corresponding to an `s` specifier; -// 2. Format string is not a literal and there is least an unsafe pointer -// argument (including the formatter argument). -// -// `UnsafeArg` is the output argument that will be set only if this function -// returns true. 
-static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, - const unsigned FmtArgIdx, ASTContext &Ctx, - bool isKprintf = false) { - class StringFormatStringHandler - : public analyze_format_string::FormatStringHandler { - const CallExpr *Call; - unsigned FmtArgIdx; - const Expr *&UnsafeArg; - - public: - StringFormatStringHandler(const CallExpr *Call, unsigned FmtArgIdx, - const Expr *&UnsafeArg) - : Call(Call), FmtArgIdx(FmtArgIdx), UnsafeArg(UnsafeArg) {} - - bool HandlePrintfSpecifier(const analyze_printf::PrintfSpecifier &FS, - const char *startSpecifier, - unsigned specifierLen, - const TargetInfo &Target) override { - if (FS.getConversionSpecifier().getKind() == - analyze_printf::PrintfConversionSpecifier::sArg) { - unsigned ArgIdx = FS.getPositionalArgIndex() + FmtArgIdx; - - if (0 < ArgIdx && ArgIdx < Call->getNumArgs()) - if (!isNullTermPointer(Call->getArg(ArgIdx))) { - UnsafeArg = Call->getArg(ArgIdx); // output - // returning false stops parsing immediately - return false; - } - } - return true; // continue parsing - } - }; - - const Expr *Fmt = Call->getArg(FmtArgIdx); - - if (auto *SL = dyn_cast(Fmt->IgnoreParenImpCasts())) { - StringRef FmtStr = SL->getString(); - StringFormatStringHandler Handler(Call, FmtArgIdx, UnsafeArg); - - return analyze_format_string::ParsePrintfString( - Handler, FmtStr.begin(), FmtStr.end(), Ctx.getLangOpts(), - Ctx.getTargetInfo(), isKprintf); - } - // If format is not a string literal, we cannot analyze the format string. - // In this case, this call is considered unsafe if at least one argument - // (including the format argument) is unsafe pointer. - return llvm::any_of( - llvm::make_range(Call->arg_begin() + FmtArgIdx, Call->arg_end()), - [&UnsafeArg](const Expr *Arg) -> bool { - if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) { - UnsafeArg = Arg; - return true; - } - return false; - }); -} - -// Matches a FunctionDecl node such that -// 1. It's name, after stripping off predefined prefix and suffix, is -// `CoreName`; and -// 2. `CoreName` or `CoreName[str/wcs]` is one of the `PredefinedNames`, which -// is a set of libc function names. -// -// Note: For predefined prefix and suffix, see `LibcFunNamePrefixSuffixParser`. -// The notation `CoreName[str/wcs]` means a new name obtained from replace -// string "wcs" with "str" in `CoreName`. -AST_MATCHER(FunctionDecl, isPredefinedUnsafeLibcFunc) { - static std::unique_ptr> PredefinedNames = nullptr; - if (!PredefinedNames) - PredefinedNames = - std::make_unique, std::set>({ - // numeric conversion: - "atof", - "atoi", - "atol", - "atoll", - "strtol", - "strtoll", - "strtoul", - "strtoull", - "strtof", - "strtod", - "strtold", - "strtoimax", - "strtoumax", - // "strfromf", "strfromd", "strfroml", // C23? 
- // string manipulation: - "strcpy", - "strncpy", - "strlcpy", - "strcat", - "strncat", - "strlcat", - "strxfrm", - "strdup", - "strndup", - // string examination: - "strlen", - "strnlen", - "strcmp", - "strncmp", - "stricmp", - "strcasecmp", - "strcoll", - "strchr", - "strrchr", - "strspn", - "strcspn", - "strpbrk", - "strstr", - "strtok", - // "mem-" functions - "memchr", - "wmemchr", - "memcmp", - "wmemcmp", - "memcpy", - "memccpy", - "mempcpy", - "wmemcpy", - "memmove", - "wmemmove", - "memset", - "wmemset", - // IO: - "fread", - "fwrite", - "fgets", - "fgetws", - "gets", - "fputs", - "fputws", - "puts", - // others - "strerror_s", - "strerror_r", - "bcopy", - "bzero", - "bsearch", - "qsort", - }); - - auto *II = Node.getIdentifier(); - - if (!II) - return false; - - StringRef Name = LibcFunNamePrefixSuffixParser().matchName( - II->getName(), Node.getBuiltinID()); - - // Match predefined names: - if (PredefinedNames->find(Name) != PredefinedNames->end()) - return true; - - std::string NameWCS = Name.str(); - size_t WcsPos = NameWCS.find("wcs"); - - while (WcsPos != std::string::npos) { - NameWCS[WcsPos++] = 's'; - NameWCS[WcsPos++] = 't'; - NameWCS[WcsPos++] = 'r'; - WcsPos = NameWCS.find("wcs", WcsPos); - } - if (PredefinedNames->find(NameWCS) != PredefinedNames->end()) - return true; - // All `scanf` functions are unsafe (including `sscanf`, `vsscanf`, etc.. They - // all should end with "scanf"): - return Name.ends_with("scanf"); -} - -// Match a call to one of the `v*printf` functions taking `va_list`. We cannot -// check safety for these functions so they should be changed to their -// non-va_list versions. -AST_MATCHER(FunctionDecl, isUnsafeVaListPrintfFunc) { - auto *II = Node.getIdentifier(); - - if (!II) - return false; - - StringRef Name = LibcFunNamePrefixSuffixParser().matchName( - II->getName(), Node.getBuiltinID()); - - if (!Name.ends_with("printf")) - return false; // neither printf nor scanf - return Name.starts_with("v"); -} - -// Matches a call to one of the `sprintf` functions as they are always unsafe -// and should be changed to `snprintf`. -AST_MATCHER(FunctionDecl, isUnsafeSprintfFunc) { - auto *II = Node.getIdentifier(); - - if (!II) - return false; - - StringRef Name = LibcFunNamePrefixSuffixParser().matchName( - II->getName(), Node.getBuiltinID()); - - if (!Name.ends_with("printf") || - // Let `isUnsafeVaListPrintfFunc` check for cases with va-list: - Name.starts_with("v")) - return false; - - StringRef Prefix = Name.drop_back(6); - - if (Prefix.ends_with("w")) - Prefix = Prefix.drop_back(1); - return Prefix == "s"; -} - -// Match function declarations of `printf`, `fprintf`, `snprintf` and their wide -// character versions. Calls to these functions can be safe if their arguments -// are carefully made safe. -AST_MATCHER(FunctionDecl, isNormalPrintfFunc) { - auto *II = Node.getIdentifier(); - - if (!II) - return false; - - StringRef Name = LibcFunNamePrefixSuffixParser().matchName( - II->getName(), Node.getBuiltinID()); - - if (!Name.ends_with("printf") || Name.starts_with("v")) - return false; - - StringRef Prefix = Name.drop_back(6); - - if (Prefix.ends_with("w")) - Prefix = Prefix.drop_back(1); - - return Prefix.empty() || Prefix == "k" || Prefix == "f" || Prefix == "sn"; -} - -// This matcher requires that it is known that the callee `isNormalPrintf`. -// Then if the format string is a string literal, this matcher matches when at -// least one string argument is unsafe. 
If the format is not a string literal, -// this matcher matches when at least one pointer type argument is unsafe. -AST_MATCHER_P(CallExpr, hasUnsafePrintfStringArg, - clang::ast_matchers::internal::Matcher, - UnsafeStringArgMatcher) { - // Determine what printf it is: - const Expr *FirstArg = Node.getArg(0); - ASTContext &Ctx = Finder->getASTContext(); - - if (isa(FirstArg->IgnoreParenImpCasts())) { - // It is a printf/kprintf. And, the format is a string literal: - bool isKprintf = false; - const Expr *UnsafeArg; - - if (auto *Callee = Node.getDirectCallee()) - if (auto *II = Node.getDirectCallee()->getIdentifier()) - isKprintf = II->getName() == "kprintf"; - if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 0, Ctx, isKprintf)) - return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); - return false; - } - - QualType PtrTy = FirstArg->getType(); - - assert(PtrTy->isPointerType()); - - QualType PteTy = (cast(PtrTy))->getPointeeType(); - - if (!Ctx.getFILEType().isNull() /* If `FILE *` is not ever in the ASTContext, - there can't be any file pointer then */ - && PteTy.getCanonicalType() == Ctx.getFILEType().getCanonicalType()) { - // It is a fprintf: - const Expr *UnsafeArg; - - if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 1, Ctx, false)) - return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); - return false; - } - - const Expr *SecondArg = Node.getArg(1); - - if (SecondArg->getType()->isIntegerType()) { - // It is a snprintf: - const Expr *UnsafeArg; - - if (unsigned UnsafeArgIdx = - hasUnsafeFormatOrSArg(&Node, UnsafeArg, 2, Ctx, false)) - return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); - return false; - } - // It is printf but the format string is passed by pointer. The only thing we - // can do is to require all pointers to be null-terminated: - for (auto Arg : Node.arguments()) - if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) - if (UnsafeStringArgMatcher.matches(*Arg, Finder, Builder)) - return true; - return false; -} - -// This matcher requires that it is known that the callee `isNormalPrintf`. -// Then it matches if the first two arguments of the call is a pointer and an -// integer and they are not in a safe pattern. -// -// For the first two arguments: `ptr` and `size`, they are safe if in the -// following patterns: -// ptr := DRE.data(); -// size:= DRE.size()/DRE.size_bytes() -// And DRE is a hardened container or view. 
-AST_MATCHER(CallExpr, hasUnsafeSnprintfBuffer) { - if (Node.getNumArgs() < 3) - return false; // not an snprintf call - - const Expr *Buf = Node.getArg(0), *Size = Node.getArg(1); - - if (!Buf->getType()->isPointerType() || !Size->getType()->isIntegerType()) - return false; // not an snprintf call - - static StringRef SizedObjs[] = {"span", "array", "vector", - "basic_string_view", "basic_string"}; - Buf = Buf->IgnoreParenImpCasts(); - Size = Size->IgnoreParenImpCasts(); - if (auto *MCEPtr = dyn_cast(Buf)) - if (auto *MCESize = dyn_cast(Size)) { - auto *DREOfPtr = dyn_cast( - MCEPtr->getImplicitObjectArgument()->IgnoreParenImpCasts()); - auto *DREOfSize = dyn_cast( - MCESize->getImplicitObjectArgument()->IgnoreParenImpCasts()); - - if (!DREOfPtr || !DREOfSize) - return true; // not in safe pattern - if (DREOfPtr->getDecl() != DREOfSize->getDecl()) - return true; // not in safe pattern - if (MCEPtr->getMethodDecl()->getName() != "data") - return true; // not in safe pattern - - if (MCESize->getMethodDecl()->getName() == "size_bytes" || - // Note here the pointer must be a pointer-to-char type unless there - // is explicit casting. If there is explicit casting, this branch - // is unreachable. Thus, at this branch "size" and "size_bytes" are - // equivalent as the pointer is a char pointer: - MCESize->getMethodDecl()->getName() == "size") - for (StringRef SizedObj : SizedObjs) - if (MCEPtr->getRecordDecl()->isInStdNamespace() && - MCEPtr->getRecordDecl()->getCanonicalDecl()->getName() == - SizedObj) - return false; // It is in fact safe - } - return true; // ptr and size are not in safe pattern -} -} // namespace libc_func_matchers } // namespace clang::ast_matchers namespace { @@ -1450,97 +1030,6 @@ class DataInvocationGadget : public WarningGadget { DeclUseList getClaimedVarUseSites() const override { return {}; } }; -class UnsafeLibcFunctionCallGadget : public WarningGadget { - const CallExpr *const Call; - const Expr *UnsafeArg = nullptr; - constexpr static const char *const Tag = "UnsafeLibcFunctionCall"; - // Extra tags for additional information: - constexpr static const char *const UnsafeSprintfTag = - "UnsafeLibcFunctionCall_sprintf"; - constexpr static const char *const UnsafeSizedByTag = - "UnsafeLibcFunctionCall_sized_by"; - constexpr static const char *const UnsafeStringTag = - "UnsafeLibcFunctionCall_string"; - constexpr static const char *const UnsafeVaListTag = - "UnsafeLibcFunctionCall_va_list"; - - enum UnsafeKind { - OTHERS = 0, // no specific information, the callee function is unsafe - SPRINTF = 1, // never call `-sprintf`s, call `-snprintf`s instead. 
- SIZED_BY = - 2, // the first two arguments of `snprintf` function have - // "__sized_by" relation but they do not conform to safe patterns - STRING = 3, // an argument is a pointer-to-char-as-string but does not - // guarantee null-termination - VA_LIST = 4, // one of the `-printf`s function that take va_list, which is - // considered unsafe as it is not compile-time check - } WarnedFunKind = OTHERS; - -public: - UnsafeLibcFunctionCallGadget(const MatchFinder::MatchResult &Result) - : WarningGadget(Kind::UnsafeLibcFunctionCall), - Call(Result.Nodes.getNodeAs(Tag)) { - if (Result.Nodes.getNodeAs(UnsafeSprintfTag)) - WarnedFunKind = SPRINTF; - else if (auto *E = Result.Nodes.getNodeAs(UnsafeStringTag)) { - WarnedFunKind = STRING; - UnsafeArg = E; - } else if (Result.Nodes.getNodeAs(UnsafeSizedByTag)) { - WarnedFunKind = SIZED_BY; - UnsafeArg = Call->getArg(0); - } else if (Result.Nodes.getNodeAs(UnsafeVaListTag)) - WarnedFunKind = VA_LIST; - } - - static Matcher matcher() { - return stmt(anyOf( - callExpr( - callee(functionDecl(anyOf( - // Match a predefined unsafe libc - // function: - functionDecl(libc_func_matchers::isPredefinedUnsafeLibcFunc()), - // Match a call to one of the `v*printf` functions - // taking va-list, which cannot be checked at - // compile-time: - functionDecl(libc_func_matchers::isUnsafeVaListPrintfFunc()) - .bind(UnsafeVaListTag), - // Match a call to a `sprintf` function, which is never - // safe: - functionDecl(libc_func_matchers::isUnsafeSprintfFunc()) - .bind(UnsafeSprintfTag)))), - // (unless the call has a sole string literal argument): - unless( - allOf(hasArgument(0, expr(stringLiteral())), hasNumArgs(1)))), - - // The following two cases require checking against actual - // arguments of the call: - - // Match a call to an `snprintf` function. And first two - // arguments of the call (that describe a buffer) are not in - // safe patterns: - callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), - libc_func_matchers::hasUnsafeSnprintfBuffer()) - .bind(UnsafeSizedByTag), - // Match a call to a `printf` function, which can be safe if - // all arguments are null-terminated: - callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), - libc_func_matchers::hasUnsafePrintfStringArg( - expr().bind(UnsafeStringTag))))); - } - - const Stmt *getBaseStmt() const { return Call; } - - SourceLocation getSourceLoc() const override { return Call->getBeginLoc(); } - - void handleUnsafeOperation(UnsafeBufferUsageHandler &Handler, - bool IsRelatedToDecl, - ASTContext &Ctx) const override { - Handler.handleUnsafeLibcCall(Call, WarnedFunKind, Ctx, UnsafeArg); - } - - DeclUseList getClaimedVarUseSites() const override { return {}; } -}; - // Represents expressions of the form `DRE[*]` in the Unspecified Lvalue // Context (see `isInUnspecifiedLvalueContext`). // Note here `[]` is the built-in subscript operator. 
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 72078ae1534b0..e6ce89dc7ec40 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2304,20 +2304,6 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { } } - void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, - ASTContext &Ctx, - const Expr *UnsafeArg = nullptr) override { - S.Diag(Call->getBeginLoc(), diag::warn_unsafe_buffer_libc_call) - << Call->getDirectCallee() // We've checked there is a direct callee - << Call->getSourceRange(); - if (PrintfInfo > 0) { - SourceRange R = - UnsafeArg ? UnsafeArg->getSourceRange() : Call->getSourceRange(); - S.Diag(R.getBegin(), diag::note_unsafe_buffer_printf_call) - << PrintfInfo << R; - } - } - void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) override { diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp deleted file mode 100644 index 2bd12db93fd52..0000000000000 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ -// RUN: -verify %s - -namespace std { - inline namespace __1 { - template< class InputIt, class OutputIt > - OutputIt copy( InputIt first, InputIt last, - OutputIt d_first ); - - struct iterator{}; - template - struct span { - T * ptr; - T * data(); - unsigned size_bytes(); - unsigned size(); - iterator begin() const noexcept; - iterator end() const noexcept; - }; - - template - struct basic_string { - T* p; - T *c_str(); - T *data(); - unsigned size_bytes(); - }; - - typedef basic_string string; - typedef basic_string wstring; - - // C function under std: - void memcpy(); - void strcpy(); - int snprintf( char* buffer, unsigned buf_size, const char* format, ... 
); - } -} - -void f(char * p, char * q, std::span s) { - std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} - std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} - std::__1::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} - std::__1::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} - - /* Test printfs */ - std::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} - std::__1::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} - std::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn - std::__1::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn -} - -void v(std::string s1) { - std::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn - std::__1::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn -} - -void g(char *begin, char *end, char *p, std::span s) { - std::copy(begin, end, p); // no warn - std::copy(s.begin(), s.end(), s.begin()); // no warn -} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp deleted file mode 100644 index 1a29654f660c9..0000000000000 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ -// RUN: -verify %s - -typedef struct {} FILE; -void memcpy(); -void __asan_memcpy(); -void strcpy(); -void strcpy_s(); -void wcscpy_s(); -unsigned strlen( const char* str ); -int fprintf( FILE* stream, const char* format, ... ); -int printf( const char* format, ... ); -int sprintf( char* buffer, const char* format, ... ); -int swprintf( char* buffer, const char* format, ... ); -int snprintf( char* buffer, unsigned buf_size, const char* format, ... ); -int snwprintf( char* buffer, unsigned buf_size, const char* format, ... ); -int snwprintf_s( char* buffer, unsigned buf_size, const char* format, ... ); -int vsnprintf( char* buffer, unsigned buf_size, const char* format, ... ); -int sscanf_s(const char * buffer, const char * format, ...); -int sscanf(const char * buffer, const char * format, ... 
); - -namespace std { - template< class InputIt, class OutputIt > - OutputIt copy( InputIt first, InputIt last, - OutputIt d_first ); - - struct iterator{}; - template - struct span { - T * ptr; - T * data(); - unsigned size_bytes(); - unsigned size(); - iterator begin() const noexcept; - iterator end() const noexcept; - }; - - template - struct basic_string { - T* p; - T *c_str(); - T *data(); - unsigned size_bytes(); - }; - - typedef basic_string string; - typedef basic_string wstring; - - // C function under std: - void memcpy(); - void strcpy(); -} - -void f(char * p, char * q, std::span s, std::span s2) { - memcpy(); // expected-warning{{function 'memcpy' is unsafe}} - std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} - __builtin_memcpy(p, q, 64); // expected-warning{{function '__builtin_memcpy' is unsafe}} - __builtin___memcpy_chk(p, q, 8, 64); // expected-warning{{function '__builtin___memcpy_chk' is unsafe}} - __asan_memcpy(); // expected-warning{{function '__asan_memcpy' is unsafe}} - strcpy(); // expected-warning{{function 'strcpy' is unsafe}} - std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} - strcpy_s(); // expected-warning{{function 'strcpy_s' is unsafe}} - wcscpy_s(); // expected-warning{{function 'wcscpy_s' is unsafe}} - - - /* Test printfs */ - fprintf((FILE*)p, "%s%d", p, *p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} - printf("%s%d", // expected-warning{{function 'printf' is unsafe}} - p, // expected-note{{string argument is not guaranteed to be null-terminated}} note attached to the unsafe argument - *p); - sprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'sprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} - swprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'swprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} - snprintf(q, 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} - snprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} - snwprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snwprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} - snwprintf_s( // expected-warning{{function 'snwprintf_s' is unsafe}} - s.data(), // expected-note{{buffer pointer and size may not match}} // note attached to the buffer - s2.size(), - "%s%d", "hello", *p); - vsnprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // expected-warning{{function 'vsnprintf' is unsafe}} expected-note{{'va_list' is unsafe}} - sscanf(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf' is unsafe}} - sscanf_s(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf_s' is unsafe}} - fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} - fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, "hello"); // no warn - printf("%s%d", "hello", *p); // no warn - snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn - snprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn - snwprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn - 
snwprintf_s(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn - strlen("hello");// no warn -} - -void v(std::string s1, int *p) { - snprintf(s1.data(), s1.size_bytes(), "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn - snprintf(s1.data(), s1.size_bytes(), s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn - printf("%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn - printf(s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn - fprintf((FILE*)0, "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn - fprintf((FILE*)0, s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn -} - - -void g(char *begin, char *end, char *p, std::span s) { - std::copy(begin, end, p); // no warn - std::copy(s.begin(), s.end(), s.begin()); // no warn -} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp index 989931e41c0cc..844311c3a51a5 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp @@ -1,6 +1,8 @@ // RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage -fsafe-buffer-usage-suggestions -verify %s +// expected-no-diagnostics + typedef unsigned __darwin_size_t; typedef __darwin_size_t size_t; #define bzero(s, n) __builtin_bzero(s, n) -void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } // expected-warning{{function '__builtin_bzero' is unsafe}} +void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } From 1254259e325428c5912843aa94f6fc663a40ea1b Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 4 Sep 2024 17:03:18 -0700 Subject: [PATCH 169/425] [hwasan] Disable test with hwasan-aliasing It's likely flaky because we tag the stack, which is unsupported in this mode. --- .../test/sanitizer_common/TestCases/Posix/fork_threaded.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c b/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c index 27b67db0c0a38..5f26ba2f330bd 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c @@ -21,6 +21,9 @@ // thread. // 2. Stack poisoned by `inparent` is not poisoned in `inchild` thread. +// Stack tagging is unsupported. +// UNSUPPORTED: hwasan-aliasing + #include #include #include From ef1ef03d4c1014d41713feb0c7edc4d0e36982f4 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Wed, 4 Sep 2024 17:38:25 -0700 Subject: [PATCH 170/425] [BPF] Fix dst/val mismatch in class ATOMIC_NOFETCH (#107288) All ATOMIC_NOFETCH insns have "$dst = $val" constraints. So let us enforce "$dst = $val" having the same register type in ATOMIC_NOFETCH as well. Currently, things work since ATOMIC_NOFETCH does not have source code pattern matching. I am experimenting to introduce memory ordering to BPFInstrInfo.td file and pattern matching will be needed. Eventually, for atomic_fetch_*() insns locked insns could be generated if memory ordering is memory_order_relaxed. 
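
As an illustrative sketch only (not part of this patch; the function name and
shape are hypothetical), the kind of C source where a relaxed atomic_fetch_add
could eventually be lowered to a plain locked add rather than a fetching insn
is one where the old value is discarded and the ordering is relaxed:

    #include <stdint.h>

    /* The result of the fetch is unused and the ordering is relaxed, so a
       non-fetching (locked) BPF atomic add would be sufficient here. */
    void bump_counter(uint64_t *counter) {
      (void)__atomic_fetch_add(counter, 1, __ATOMIC_RELAXED);
    }
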
[1] https://lore.kernel.org/bpf/7b941f53-2a05-48ec-9032-8f106face3a3@linux.dev/ --- llvm/lib/Target/BPF/BPFInstrInfo.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 6c750af5c2fd9..f7e17901c7ed5 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -790,7 +790,7 @@ let Predicates = [BPFNoALU32] in { class ATOMIC_NOFETCH : TYPE_LD_ST { From c82a5496c80747981efb8d25ad8bc4d8c6785b2e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 17:49:50 -0700 Subject: [PATCH 171/425] [RISCV] Support fixed vector VP_LOAD/STORE for bf16 and f16 without Zvfh. (#107297) This allows odd sized vector load/store to be legalized to a VP_LOAD/STORE using EVL. I changed the bf16 tests in fixed-vectors-load.ll and fixed-vectors-store.ll to use an illegal type to be consistent with the intent of these files. A legal type is already tested in fixed-vectors-load-store.ll --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 2786 ++++++----------- .../CodeGen/RISCV/rvv/fixed-vectors-load.ll | 10 +- .../CodeGen/RISCV/rvv/fixed-vectors-store.ll | 8 +- 4 files changed, 974 insertions(+), 1834 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 2ea909b085a6d..a9061a05c7c67 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1317,6 +1317,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // FIXME: mload, mstore, mgather, mscatter, vp_load/store, // vp_stride_load/store, vp_gather/scatter can be hoisted to here. setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); + setOperationAction({ISD::VP_LOAD, ISD::VP_STORE}, VT, Custom); setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, @@ -1378,8 +1379,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction( {ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, VT, Custom); - setOperationAction({ISD::VP_LOAD, ISD::VP_STORE, - ISD::EXPERIMENTAL_VP_STRIDED_LOAD, + setOperationAction({ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 56cd718536daa..d996a9c05aca4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -49,42 +49,20 @@ define void @fadd_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fadd_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; 
ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fadd_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fadd_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v8, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = fadd <6 x half> %a, %b @@ -173,42 +151,20 @@ define void @fsub_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fsub_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fsub_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fsub_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfsub.vv v8, v8, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, 
ptr %x %b = load <6 x half>, ptr %y %c = fsub <6 x half> %a, %b @@ -297,42 +253,20 @@ define void @fmul_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmul_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmul_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmul_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmul.vv v8, v8, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = fmul <6 x half> %a, %b @@ -421,42 +355,20 @@ define void @fdiv_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fdiv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fdiv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; 
ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fdiv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfdiv.vv v8, v8, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = fdiv <6 x half> %a, %b @@ -576,115 +488,55 @@ define void @fneg_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fneg_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) -; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-RV32-NEXT: lui a3, 1048568 -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: xor a4, a4, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV32-NEXT: xor a5, a5, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-RV32-NEXT: xor a6, a6, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: lui t0, 8 -; ZVFHMIN-RV32-NEXT: xor a7, a7, t0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a7 -; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-RV32-NEXT: xor a6, a7, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 -; ZVFHMIN-RV32-NEXT: xor a3, a6, a3 -; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fneg_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV64-NEXT: 
.cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) -; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-RV64-NEXT: lui a3, 1048568 -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV64-NEXT: lui a5, 8 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV64-NEXT: xor a4, a4, a3 -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fneg_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vse16.v v8, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: flh fa4, 0(sp) +; ZVFHMIN-NEXT: flh fa3, 4(sp) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-NEXT: lui a3, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-NEXT: xor a4, a4, a3 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; 
ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = fneg <6 x half> %a store <6 x half> %b, ptr %x @@ -851,9 +703,10 @@ define void @fabs_v6f16(ptr %x) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 ; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) @@ -864,44 +717,35 @@ define void @fabs_v6f16(ptr %x) { ; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 ; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) ; ZVFHMIN-RV32-NEXT: addi a3, a3, -1 -; ZVFHMIN-RV32-NEXT: and a1, a1, a3 ; ZVFHMIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) ; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 ; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-RV32-NEXT: and a4, a4, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) ; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: and a5, a5, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: and a6, a6, a3 -; ZVFHMIN-RV32-NEXT: and a7, a7, a3 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a7 -; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-RV32-NEXT: and a6, a7, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 -; ZVFHMIN-RV32-NEXT: and a3, a6, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 ; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV32-NEXT: ret ; @@ -909,9 +753,10 @@ define void @fabs_v6f16(ptr %x) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 ; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, 
(a0) ; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) @@ -948,12 +793,9 @@ define void @fabs_v6f16(ptr %x) { ; ZVFHMIN-RV64-NEXT: and a1, a1, a3 ; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x @@ -1314,10 +1156,11 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32: # %bb.0: ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) @@ -1345,35 +1188,26 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; @@ -1381,10 +1215,11 
@@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64: # %bb.0: ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) @@ -1429,12 +1264,9 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; @@ -1442,93 +1274,85 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV32: # %bb.0: ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 16(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: lui t1, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, t1, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, t1, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a4, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, t1 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t0, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a7, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t2, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, t0, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t0, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, t2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a7, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t2, t1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a6, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t1, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a5, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, t1, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a7, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 
4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; @@ -1536,10 +1360,11 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64: # %bb.0: ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) @@ -1611,12 +1436,9 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x @@ -1909,9 +1731,10 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFH-RV32: # %bb.0: ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) @@ -1928,36 +1751,27 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 8(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a2 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fmv.h 
fa5, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; @@ -1965,9 +1779,10 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFH-RV64: # %bb.0: ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) @@ -2002,12 +1817,9 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; @@ -2015,70 +1827,62 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFHIN-RV32: # %bb.0: ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a1, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a4, 8 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a5, a2, -1 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a4, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, a7, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t0, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, t1, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; @@ -2086,9 +1890,10 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFHIN-RV64: # %bb.0: ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa0 @@ -2138,12 +1943,9 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x @@ -2523,10 +2325,11 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32: # %bb.0: ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) @@ -2549,40 +2352,31 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 6(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa0, ft0, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa1, ft1, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa2, fa0, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa3, fa1, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: flh 
fa3, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 14(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; @@ -2590,10 +2384,11 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64: # %bb.0: ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) @@ -2638,12 +2433,9 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; @@ -2651,10 +2443,11 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV32: # %bb.0: ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 28(sp) @@ -2665,87 +2458,78 @@ define void @copysign_neg_v6f16(ptr %x, 
ptr %y) { ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 22(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 16(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: not a2, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: not a6, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: not t1, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: not t2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a6, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a4, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a4) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a5, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a5) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: not t3, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a5, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui t0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, t0, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, a7, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a7, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t0, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui t3, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, t3, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, a5, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a7, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, t4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a5, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t2, t2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t2, t4, t2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, t3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, t4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t4, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and t3, t3, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t3, t4, t3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and t2, t2, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t2, t4, t2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a7, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 ; 
ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t1, t4, t1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, t3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, t4, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, t1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a7, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a6, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a4, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a5 ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; @@ -2753,10 +2537,11 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64: # %bb.0: ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 28(sp) @@ -2836,12 +2621,9 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; 
ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x @@ -3112,135 +2894,124 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: ; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e32, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 16(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 12(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa4, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; ; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: ; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle32.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; 
ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v10, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 12(sp) ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa3 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 6(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: ; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 3, e32, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 8 ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 10(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; 
ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a2, -1 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a4 ; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: lui a6, 1048568 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 ; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a5, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 ; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 @@ -3249,75 +3020,66 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: ; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle32.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v10, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 
1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 8 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a4, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a5, a4, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a4, a2, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a6, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a1, a3 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 22(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <3 x half>, ptr %x %b = load <3 x float>, ptr %y @@ -3385,38 +3147,18 @@ define void @sqrt_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: sqrt_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: 
vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsqrt.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: sqrt_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsqrt.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: sqrt_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfsqrt.v v8, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.sqrt.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x @@ -3508,46 +3250,22 @@ define void @fma_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fma_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a2) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v8, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fma_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a2) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v8, v11 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: 
fma_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a2) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = load <6 x half>, ptr %z @@ -3692,128 +3410,63 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmsub_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a2) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) -; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-RV32-NEXT: lui a3, 1048568 -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV32-NEXT: lui a5, 8 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: xor a4, a4, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV32-NEXT: vmv.v.x v11, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV32-NEXT: vslidedown.vi v11, v8, 4, v0.t -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmsub_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a2) -; 
ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) -; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-RV64-NEXT: lui a3, 1048568 -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV64-NEXT: lui a5, 8 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV64-NEXT: xor a4, a4, a3 -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v11, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v11, v8, 4, v0.t -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmsub_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a2) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vse16.v v8, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: flh fa4, 0(sp) +; ZVFHMIN-NEXT: flh fa3, 4(sp) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-NEXT: lui a3, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-NEXT: xor a4, a4, a3 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v11, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx 
v11, v11, a1 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-NEXT: vslidedown.vi v11, v8, 4, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = load <6 x half>, ptr %z @@ -4565,52 +4218,25 @@ define void @fadd_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fadd_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fadd_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fadd_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v9, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, 
i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -4699,52 +4325,25 @@ define void @fadd_fv_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fadd_fv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v10, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fadd_fv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v10, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fadd_fv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -4833,52 +4432,25 @@ define void @fsub_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fsub_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: 
vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fsub_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fsub_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfsub.vv v8, v9, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -4967,52 +4539,25 @@ define void @fsub_fv_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fsub_fv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v10, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fsub_fv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; 
ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v10, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fsub_fv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfsub.vv v8, v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -5101,52 +4646,25 @@ define void @fmul_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmul_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmul_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; 
ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmul_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmul.vv v8, v9, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -5235,52 +4753,25 @@ define void @fmul_fv_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmul_fv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v10, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmul_fv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v10, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmul_fv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma 
+; ZVFHMIN-NEXT: vfmul.vv v8, v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -5369,52 +4860,25 @@ define void @fdiv_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fdiv_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fdiv_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fdiv_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfdiv.vv v8, v9, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -5503,52 +4967,25 @@ define void @fdiv_fv_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fdiv_fv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; 
ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v10, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fdiv_fv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v10, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fdiv_fv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfdiv.vv v8, v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -5642,56 +5079,27 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fma_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, 
v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fma_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fma_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = insertelement <6 x half> poison, half %z, i32 0 @@ -5791,56 +5199,27 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) { ; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fma_fv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fma_fv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; 
ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fma_fv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = insertelement <6 x half> poison, half %z, i32 0 @@ -5982,138 +5361,68 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmsub_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 0(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV32-NEXT: li a4, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a4 -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a3, v0 -; ZVFHMIN-RV32-NEXT: lui a1, 1048568 -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV32-NEXT: lui a5, 8 -; ZVFHMIN-RV32-NEXT: xor a4, a4, a5 -; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a4 -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a2 -; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a3 -; ZVFHMIN-RV32-NEXT: xor a4, a4, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a4 -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV32-NEXT: xor a3, a3, a5 -; ZVFHMIN-RV32-NEXT: vmv.v.x v11, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 -; 
ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: xor a1, a2, a1 -; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV32-NEXT: vslidedown.vi v11, v10, 4, v0.t -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v11 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmsub_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 0(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: li a4, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a4 -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 4(sp) -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a3, v0 -; ZVFHMIN-RV64-NEXT: lui a1, 1048568 -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV64-NEXT: lui a5, 8 -; ZVFHMIN-RV64-NEXT: xor a4, a4, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a4 -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a2 -; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a3 -; ZVFHMIN-RV64-NEXT: xor a4, a4, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a4 -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV64-NEXT: xor a3, a3, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v11, a3 -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a3 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: xor a1, a2, a1 -; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v11, v10, 4, v0.t -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v11 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 -; 
ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmsub_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vse16.v v9, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 0(sp) +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: li a4, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a4 +; ZVFHMIN-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-NEXT: flh fa5, 4(sp) +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a3, v0 +; ZVFHMIN-NEXT: lui a1, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a4, a4, a5 +; ZVFHMIN-NEXT: vmv.v.x v10, a4 +; ZVFHMIN-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a2, a2, a1 +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a2 +; ZVFHMIN-NEXT: xor a3, a3, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a3 +; ZVFHMIN-NEXT: xor a4, a4, a1 +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a4 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a2, a2, a1 +; ZVFHMIN-NEXT: xor a3, a3, a5 +; ZVFHMIN-NEXT: vmv.v.x v11, a3 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-NEXT: xor a3, a3, a1 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a3 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: xor a1, a2, a1 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-NEXT: vslidedown.vi v11, v10, 4, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = insertelement <6 x half> poison, half %z, i32 0 @@ -6715,9 +6024,10 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV32: # %bb.0: ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV32-NEXT: lui a1, %hi(.LCPI116_0) @@ -6750,79 +6060,69 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_6: ; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 6(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa0, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa0, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB116_8 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa0, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_8 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa3, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: 
fcvt.h.w fa1, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa1, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_8: ; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_10 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa1, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a2, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa4, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_10: ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa2, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a4, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a4, .LBB116_12 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_12 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a4, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a3, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa2, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_12: -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a3 ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a6, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a6, .LBB116_14 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_14 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a5, fa2, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a5, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa4, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_14: ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a6, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a6, .LBB116_16 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB116_16 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a5, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a5, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_16: -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 +; 
ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; @@ -6830,9 +6130,10 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV64: # %bb.0: ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: lui a1, %hi(.LCPI116_0) @@ -6925,13 +6226,9 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; @@ -6939,9 +6236,10 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV32: # %bb.0: ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa5 @@ -6980,89 +6278,79 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa0 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa0, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_8 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_8 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, 
fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a1, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa1, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_8: ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_10 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_10 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a3, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_10: -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a4, fa1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a4, .LBB116_12 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_12 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a4, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a3, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa2, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_12: -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 12(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a6, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a6, .LBB116_14 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_14 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a5, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a5, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a2, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_14: ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: 
fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a6, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a6, .LBB116_16 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_16 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a5, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a5, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a1, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa5, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_16: ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; @@ -7070,9 +6358,10 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV64: # %bb.0: ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa5 @@ -7181,13 +6470,9 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x @@ -7320,56 +6605,27 @@ define void @ceil_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: ceil_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; 
ZVFHMIN-RV32-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV32-NEXT: lui a1, 307200 -; ZVFHMIN-RV32-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV32-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV32-NEXT: fsrmi a1, 3 -; ZVFHMIN-RV32-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: fsrm a1 -; ZVFHMIN-RV32-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: ceil_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV64-NEXT: lui a1, 307200 -; ZVFHMIN-RV64-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV64-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV64-NEXT: fsrmi a1, 3 -; ZVFHMIN-RV64-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: fsrm a1 -; ZVFHMIN-RV64-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: ceil_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a1, 3 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a1 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.ceil.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x @@ -7506,56 +6762,27 @@ define void @floor_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: floor_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV32-NEXT: lui a1, 307200 -; ZVFHMIN-RV32-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV32-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV32-NEXT: fsrmi a1, 2 -; ZVFHMIN-RV32-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: fsrm a1 -; ZVFHMIN-RV32-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: 
vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: floor_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV64-NEXT: lui a1, 307200 -; ZVFHMIN-RV64-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV64-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV64-NEXT: fsrmi a1, 2 -; ZVFHMIN-RV64-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: fsrm a1 -; ZVFHMIN-RV64-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: floor_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a1, 2 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a1 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.floor.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x @@ -7692,56 +6919,27 @@ define void @round_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: round_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV32-NEXT: lui a1, 307200 -; ZVFHMIN-RV32-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV32-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV32-NEXT: fsrmi a1, 4 -; ZVFHMIN-RV32-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: fsrm a1 -; ZVFHMIN-RV32-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: round_v6f16: -; ZVFHMIN-RV64: 
# %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV64-NEXT: lui a1, 307200 -; ZVFHMIN-RV64-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV64-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV64-NEXT: fsrmi a1, 4 -; ZVFHMIN-RV64-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: fsrm a1 -; ZVFHMIN-RV64-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: round_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a1, 4 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a1 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.round.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x @@ -8075,56 +7273,27 @@ define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmuladd_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v10, (a2) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v8, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmuladd_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v10, (a2) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v8, v11 -; 
ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmuladd_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a2) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmul.vv v8, v8, v11 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v8, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = load <6 x half>, ptr %z @@ -8233,56 +7402,27 @@ define void @fmsub_fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmsub_fmuladd_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v10, (a2) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v8, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmsub_fmuladd_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v10, (a2) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v8, v11 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, 
zero, e16, mf2, ta, ma
-; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8
-; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0)
-; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2
-; ZVFHMIN-RV64-NEXT: addi a0, a0, 8
-; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0)
-; ZVFHMIN-RV64-NEXT: ret
+; ZVFHMIN-LABEL: fmsub_fmuladd_v6f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vle16.v v9, (a0)
+; ZVFHMIN-NEXT: vle16.v v10, (a2)
+; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfmul.vv v8, v8, v11
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfsub.vv v8, v8, v9
+; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT: vse16.v v9, (a0)
+; ZVFHMIN-NEXT: ret
 %a = load <6 x half>, ptr %x
 %b = load <6 x half>, ptr %y
 %c = load <6 x half>, ptr %z
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
index 19587438ea947..22cde3c36ef61 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
@@ -182,12 +182,12 @@ define <16 x i64> @exact_vlen_i64_m8(ptr %p) vscale_range(2,2) {
 ret <16 x i64> %v
 }
 
-define <8 x bfloat> @load_v8bf16(ptr %p) {
-; CHECK-LABEL: load_v8bf16:
+define <6 x bfloat> @load_v6bf16(ptr %p) {
+; CHECK-LABEL: load_v6bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT: vle16.v v8, (a0)
 ; CHECK-NEXT: ret
- %x = load <8 x bfloat>, ptr %p
- ret <8 x bfloat> %x
+ %x = load <6 x bfloat>, ptr %p
+ ret <6 x bfloat> %x
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
index 7f18ee44631a1..169d99abb13c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
@@ -294,13 +294,13 @@ define void @exact_vlen_i64_m8(ptr %p) vscale_range(2,2) {
 ret void
 }
 
-define void @store_v8bf16(ptr %p, <8 x bfloat> %v) {
-; CHECK-LABEL: store_v8bf16:
+define void @store_v6bf16(ptr %p, <6 x bfloat> %v) {
+; CHECK-LABEL: store_v6bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT: vse16.v v8, (a0)
 ; CHECK-NEXT: ret
- store <8 x bfloat> %v, ptr %p
+ store <6 x bfloat> %v, ptr %p
 ret void
 }

From b2048de55ea934b70902864767b0cc8dfada8be0 Mon Sep 17 00:00:00 2001
From: Max Winkler
Date: Wed, 4 Sep 2024 18:52:39 -0700
Subject: [PATCH 172/425] [Clang] [Driver] Support `-fjmc` for `*-windows-msvc` target in non cl driver modes (#107177)

Allow `-fjmc` to be used if the target triple targets MSVC (`*-windows-msvc`), regardless of the driver mode used. In general the driver mode shouldn't control the target triple. Also, in our custom build system I am trying to just treat clang as clang. This is because, while the `cl` driver mode emulates the MSVC interface quite well, there are still a lot of operations that are clang specific. The optimization modes do not map directly from MSVC to clang, and warnings do not map from MSVC to clang either.
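As a rough usage sketch (illustrative only; the file name foo.c is hypothetical, and the driver test updates below exercise the same flags), the plain clang driver now forwards the flag for an MSVC triple as long as debug info is enabled:

  # Forwards -fjmc to cc1 in plain clang driver mode when targeting *-windows-msvc.
  clang -fjmc -g -target x86_64-pc-windows-msvc -c foo.c

Without `-g`, the flag is still ignored and the "-fjmc requires debug info" warning is emitted, matching the existing behaviour for ELF targets.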
Instead of wrapping options with `/clang:` when using `clang-cl.exe`, it is simpler to always invoke the clang driver, regardless of the target triple.
---
 clang/lib/Driver/ToolChains/Clang.cpp | 3 ++-
 clang/test/Driver/clang_f_opts.c | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index baac1215215b9..90a747ca58986 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4853,7 +4853,8 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T,
 
   // This controls whether or not we perform JustMyCode instrumentation.
   if (Args.hasFlag(options::OPT_fjmc, options::OPT_fno_jmc, false)) {
-    if (TC.getTriple().isOSBinFormatELF() || D.IsCLMode()) {
+    if (TC.getTriple().isOSBinFormatELF() ||
+        TC.getTriple().isWindowsMSVCEnvironment()) {
       if (DebugInfoKind >= llvm::codegenoptions::DebugInfoConstructor)
         CmdArgs.push_back("-fjmc");
       else if (D.IsCLMode())
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index d69cd199ac61d..335fa546a1388 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -600,10 +600,12 @@
 // CHECK_NO_DISABLE_DIRECT-NOT: -fobjc-disable-direct-methods-for-testing
 
 // RUN: %clang -### -S -fjmc -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN,CHECK_NOJMC %s
-// RUN: %clang -### -S -fjmc -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN_NOT_ELF,CHECK_NOJMC %s
+// RUN: %clang -### -S -fjmc -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN,CHECK_NOJMC %s
+// RUN: %clang -### -S -fjmc -g -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_JMC %s
+// RUN: %clang -### -S -fjmc -g -fno-jmc -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC %s
 // RUN: %clang -### -S -fjmc -g -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_JMC %s
 // RUN: %clang -### -S -fjmc -g -fno-jmc -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC %s
-// RUN: %clang -### -fjmc -g -flto -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN_NOT_ELF,CHECK_NOJMC_LTO %s
+// RUN: %clang -### -fjmc -g -flto -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC_LTO %s
 // RUN: %clang -### -fjmc -g -flto -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_JMC_LTO %s
 // RUN: %clang -### -fjmc -g -flto -fno-jmc -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC_LTO %s
 // CHECK_JMC_WARN: -fjmc requires debug info. Use -g or debug options that enable debugger's stepping function; option ignored

From eb2929d323c0c44f2037cf8a345ca6984ec228eb Mon Sep 17 00:00:00 2001
From: Xiang Li
Date: Wed, 4 Sep 2024 18:59:42 -0700
Subject: [PATCH 173/425] [DirectX] use DXILMetadataAnalysis to build PSVRuntimeInfo (#107101)

Replace the hardcoded compute-shader values in DXContainer::addPipelineStateValidationInfo. Wave size is still missing. Mark the metadata analysis as preserved by the earlier passes so the information is not lost.
Fix https://github.com/llvm/wg-hlsl/issues/51 --- .../lib/Target/DirectX/DXContainerGlobals.cpp | 29 +++++++++---- llvm/lib/Target/DirectX/DXILPrepare.cpp | 2 + .../Target/DirectX/DXILTranslateMetadata.cpp | 2 + .../DirectX/ContainerData/RuntimeInfoCS.ll | 41 +++++++++++++++++++ 4 files changed, 67 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RuntimeInfoCS.ll diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index d47b9c7a25b8f..aa7769899ff27 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Constants.h" @@ -57,6 +58,7 @@ class DXContainerGlobals : public llvm::ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); AU.addRequired(); + AU.addRequired(); } }; @@ -143,23 +145,35 @@ void DXContainerGlobals::addPipelineStateValidationInfo( SmallString<256> Data; raw_svector_ostream OS(Data); PSVRuntimeInfo PSV; - Triple TT(M.getTargetTriple()); PSV.BaseData.MinimumWaveLaneCount = 0; PSV.BaseData.MaximumWaveLaneCount = std::numeric_limits::max(); + + dxil::ModuleMetadataInfo &MMI = + getAnalysis().getModuleMetadata(); + assert(MMI.EntryPropertyVec.size() == 1 || + MMI.ShaderStage == Triple::Library); PSV.BaseData.ShaderStage = - static_cast(TT.getEnvironment() - Triple::Pixel); + static_cast(MMI.ShaderStage - Triple::Pixel); // Hardcoded values here to unblock loading the shader into D3D. // // TODO: Lots more stuff to do here! // // See issue https://github.com/llvm/llvm-project/issues/96674. 
- PSV.BaseData.NumThreadsX = 1; - PSV.BaseData.NumThreadsY = 1; - PSV.BaseData.NumThreadsZ = 1; - PSV.EntryName = "main"; + switch (MMI.ShaderStage) { + case Triple::Compute: + PSV.BaseData.NumThreadsX = MMI.EntryPropertyVec[0].NumThreadsX; + PSV.BaseData.NumThreadsY = MMI.EntryPropertyVec[0].NumThreadsY; + PSV.BaseData.NumThreadsZ = MMI.EntryPropertyVec[0].NumThreadsZ; + break; + default: + break; + } + + if (MMI.ShaderStage != Triple::Library) + PSV.EntryName = MMI.EntryPropertyVec[0].Entry->getName(); - PSV.finalize(TT.getEnvironment()); + PSV.finalize(MMI.ShaderStage); PSV.write(OS); Constant *Constant = ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false); @@ -170,6 +184,7 @@ char DXContainerGlobals::ID = 0; INITIALIZE_PASS_BEGIN(DXContainerGlobals, "dxil-globals", "DXContainer Global Emitter", false, true) INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper) +INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) INITIALIZE_PASS_END(DXContainerGlobals, "dxil-globals", "DXContainer Global Emitter", false, true) diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index 56098864e987f..f6b7355b93625 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSet.h" +#include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/AttributeMask.h" #include "llvm/IR/IRBuilder.h" @@ -247,6 +248,7 @@ class DXILPrepareModule : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved(); AU.addPreserved(); + AU.addPreserved(); } static char ID; // Pass identification. }; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 2c6d20112060d..11cd9df1d1dc4 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -13,6 +13,7 @@ #include "DXILShaderFlags.h" #include "DirectX.h" #include "llvm/ADT/StringSet.h" +#include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/Analysis/DXILResource.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Metadata.h" @@ -103,6 +104,7 @@ class DXILTranslateMetadataLegacy : public ModulePass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); } bool runOnModule(Module &M) override { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RuntimeInfoCS.ll b/llvm/test/CodeGen/DirectX/ContainerData/RuntimeInfoCS.ll new file mode 100644 index 0000000000000..595e70092bb08 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ContainerData/RuntimeInfoCS.ll @@ -0,0 +1,41 @@ +; RUN: opt %s -dxil-embed -dxil-globals -S -o - | FileCheck %s +; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC +target triple = "dxil-unknown-shadermodel6.0-compute" + +; CHECK: @dx.psv0 = private constant [80 x i8] c"{{.*}}", section "PSV0", align 4 + +define void @cs_main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.numthreads"="8,8,1" "hlsl.shader"="compute" } + +!dx.valver = !{!0} + +!0 = !{i32 1, i32 7} + +; DXC: - Name: PSV0 +; DXC-NEXT: Size: 80 +; DXC-NEXT: PSVInfo: +; DXC-NEXT: Version: 3 +; DXC-NEXT: ShaderStage: 5 +; DXC-NEXT: MinimumWaveLaneCount: 0 +; DXC-NEXT: MaximumWaveLaneCount: 4294967295 +; DXC-NEXT: UsesViewID: 0 +; DXC-NEXT: SigInputVectors: 0 +; DXC-NEXT: SigOutputVectors: [ 0, 0, 0, 0 ] +; DXC-NEXT: NumThreadsX: 
8 +; DXC-NEXT: NumThreadsY: 8 +; DXC-NEXT: NumThreadsZ: 1 +; DXC-NEXT: EntryName: cs_main +; DXC-NEXT: ResourceStride: 24 +; DXC-NEXT: Resources: [] +; DXC-NEXT: SigInputElements: [] +; DXC-NEXT: SigOutputElements: [] +; DXC-NEXT: SigPatchOrPrimElements: [] +; DXC-NEXT: InputOutputMap: +; DXC-NEXT: - [ ] +; DXC-NEXT: - [ ] +; DXC-NEXT: - [ ] +; DXC-NEXT: - [ ] From c28b1a19aadff97b369889aee084073a181cfda8 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 19:14:18 -0700 Subject: [PATCH 174/425] [LegalizeTypes][RISCV] Use SPLAT_VECTOR_PARTS to legalize splat BUILD_VECTOR (#107290) If the element type needs to be expanded, we can use SPLAT_VECTOR_PARTS if the target supports it. There's already a DAGCombine to turn BUILD_VECTOR into SPLAT_VECTOR if the target makes SPLAT_VECTOR legal, but it doesn't fire for vectors that need to be split. --- .../SelectionDAG/LegalizeTypesGeneric.cpp | 9 + .../CodeGen/RISCV/rvv/fixed-vectors-llrint.ll | 16 +- .../rvv/fixed-vectors-masked-load-int.ll | 56 ++--- .../rvv/fixed-vectors-masked-store-int.ll | 122 ++++------- .../RISCV/rvv/fixed-vectors-vadd-vp.ll | 172 +++++---------- .../RISCV/rvv/fixed-vectors-vand-vp.ll | 36 ++-- .../RISCV/rvv/fixed-vectors-vmax-vp.ll | 66 ++---- .../RISCV/rvv/fixed-vectors-vmaxu-vp.ll | 66 ++---- .../RISCV/rvv/fixed-vectors-vmin-vp.ll | 66 ++---- .../RISCV/rvv/fixed-vectors-vminu-vp.ll | 66 ++---- .../RISCV/rvv/fixed-vectors-vsadd-vp.ll | 187 +++++------------ .../RISCV/rvv/fixed-vectors-vsaddu-vp.ll | 187 +++++------------ .../RISCV/rvv/fixed-vectors-vssub-vp.ll | 195 ++++++------------ .../RISCV/rvv/fixed-vectors-vssubu-vp.ll | 195 ++++++------------ .../CodeGen/RISCV/srem-seteq-illegal-types.ll | 61 +++--- 15 files changed, 500 insertions(+), 1000 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index b402e82376276..2655e8428309d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -376,6 +376,15 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) { assert(OldVT == VecVT.getVectorElementType() && "BUILD_VECTOR operand type doesn't match vector element type!"); + if (VecVT.isInteger() && TLI.isOperationLegal(ISD::SPLAT_VECTOR, VecVT) && + TLI.isOperationLegalOrCustom(ISD::SPLAT_VECTOR_PARTS, VecVT)) { + if (SDValue V = cast(N)->getSplatValue()) { + SDValue Lo, Hi; + GetExpandedOp(V, Lo, Hi); + return DAG.getNode(ISD::SPLAT_VECTOR_PARTS, dl, VecVT, Lo, Hi); + } + } + // Build a vector of twice the length out of the expanded elements. // For example <3 x i64> -> <6 x i32>. 
SmallVector NewElts; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll index 901be442c0012..d52cbb54c4b2d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll @@ -14,9 +14,11 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) { ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32-NEXT: vfmv.f.s fa0, v8 ; RV32-NEXT: call llrintf -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -669,9 +671,11 @@ define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) { ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vfmv.f.s fa0, v8 ; RV32-NEXT: call llrint -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll index ad075e4b4e198..2f20caa6eb189 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll @@ -397,43 +397,22 @@ define void @masked_load_v32i32(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { declare <32 x i32> @llvm.masked.load.v32i32(ptr, i32, <32 x i1>, <32 x i32>) define void @masked_load_v32i64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { -; RV32-LABEL: masked_load_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: addi a3, a1, 128 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vle64.v v24, (a3) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vmseq.vv v8, v0, v16 -; RV32-NEXT: vmseq.vv v0, v24, v16 -; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vle64.v v16, (a1), v0.t -; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vle64.v v8, (a0), v0.t -; RV32-NEXT: vse64.v v8, (a2) -; RV32-NEXT: addi a0, a2, 128 -; RV32-NEXT: vse64.v v16, (a0) -; RV32-NEXT: ret -; -; RV64-LABEL: masked_load_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: addi a3, a1, 128 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v16, (a1) -; RV64-NEXT: vle64.v v24, (a3) -; RV64-NEXT: vmseq.vi v8, v16, 0 -; RV64-NEXT: vmseq.vi v0, v24, 0 -; RV64-NEXT: addi a1, a0, 128 -; RV64-NEXT: vle64.v v16, (a1), v0.t -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vle64.v v8, (a0), v0.t -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: addi a0, a2, 128 -; RV64-NEXT: vse64.v v16, (a0) -; RV64-NEXT: ret +; CHECK-LABEL: masked_load_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a3, a1, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: vle64.v v24, (a3) +; CHECK-NEXT: vmseq.vi v8, v16, 0 +; CHECK-NEXT: vmseq.vi v0, v24, 0 +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vle64.v v16, (a1), v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vle64.v 
v8, (a0), v0.t +; CHECK-NEXT: vse64.v v8, (a2) +; CHECK-NEXT: addi a0, a2, 128 +; CHECK-NEXT: vse64.v v16, (a0) +; CHECK-NEXT: ret %m = load <32 x i64>, ptr %m_ptr %mask = icmp eq <32 x i64> %m, zeroinitializer %load = call <32 x i64> @llvm.masked.load.v32i64(ptr %a, i32 8, <32 x i1> %mask, <32 x i64> undef) @@ -547,3 +526,6 @@ define void @masked_load_v256i8(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { ret void } declare <256 x i8> @llvm.masked.load.v256i8(ptr, i32, <256 x i1>, <256 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll index 86c28247e97ef..90690bbc8e208 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll @@ -397,87 +397,44 @@ define void @masked_store_v32i32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { declare void @llvm.masked.store.v32i32.p0(<32 x i32>, ptr, i32, <32 x i1>) define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { -; RV32-LABEL: masked_store_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: sub sp, sp, a3 -; RV32-NEXT: addi a3, a2, 128 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v24, (a2) -; RV32-NEXT: vle64.v v8, (a3) -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vmseq.vv v7, v24, v8 -; RV32-NEXT: addi a2, a0, 128 -; RV32-NEXT: vle64.v v24, (a2) -; RV32-NEXT: vle64.v v16, (a0) -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmseq.vv v0, v16, v8 -; RV32-NEXT: addi a0, a1, 128 -; RV32-NEXT: vse64.v v24, (a0), v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vse64.v v8, (a1), v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: masked_store_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 4 -; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v8, (a2) -; RV64-NEXT: addi a2, a2, 128 -; RV64-NEXT: vle64.v v16, (a2) -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmseq.vi v0, v8, 0 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vmseq.vi v8, v16, 0 -; 
RV64-NEXT: vse64.v v24, (a1), v0.t -; RV64-NEXT: addi a0, a1, 128 -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vse64.v v8, (a0), v0.t -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: ret +; CHECK-LABEL: masked_store_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: sub sp, sp, a3 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v8, (a2) +; CHECK-NEXT: addi a2, a2, 128 +; CHECK-NEXT: vle64.v v16, (a2) +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmseq.vi v0, v8, 0 +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmseq.vi v8, v16, 0 +; CHECK-NEXT: vse64.v v24, (a1), v0.t +; CHECK-NEXT: addi a0, a1, 128 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret %m = load <32 x i64>, ptr %m_ptr %mask = icmp eq <32 x i64> %m, zeroinitializer %val = load <32 x i64>, ptr %val_ptr @@ -683,3 +640,6 @@ define void @masked_store_v256i8(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ret void } declare void @llvm.masked.store.v256i8.p0(<256 x i8>, ptr, i32, <256 x i1>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll index 5601bd5ee7a3a..805a3c640957b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -1346,93 +1346,48 @@ define <16 x i64> @vadd_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { declare <32 x i64> @llvm.vp.add.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vadd_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB108_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB108_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vadd_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB108_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB108_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v8, v8, -1, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vadd_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; RV32-LABEL: vadd_vi_v32i64_unmasked: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB109_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB109_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v8, v24 -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v16, v24 -; RV32-NEXT: ret -; -; RV64-LABEL: vadd_vi_v32i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: li 
a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB109_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB109_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v8, v8, -1 -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v16, v16, -1 -; RV64-NEXT: ret +; CHECK-LABEL: vadd_vi_v32i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, -1 +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v16, -1 +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v } @@ -1440,49 +1395,26 @@ define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; FIXME: We don't match vadd.vi on RV32. define <32 x i64> @vadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vadd_vx_v32i64_evl12: -; RV32: # %bb.0: -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vadd_vx_v32i64_evl12: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v8, v8, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vadd_vx_v32i64_evl12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v } define <32 x i64> @vadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vadd_vx_v32i64_evl27: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vadd_vx_v32i64_evl27: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vadd_vx_v32i64_evl27: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll index d414be76672ab..c413dd86f3712 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll @@ -1139,18 +1139,16 @@ define <11 x i64> @vand_vv_v11i64_unmasked(<11 x i64> %va, <11 x i64> %b, i32 ze define <11 x i64> @vand_vx_v11i64(<11 x i64> %va, i64 %b, <11 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vand_vx_v11i64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v16, v0 -; RV32-NEXT: lui a3, 341 -; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a3 -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vmerge.vxm v24, v24, a0, v0 -; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vand_vx_v11i64: @@ -1167,16 +1165,16 @@ define <11 x i64> @vand_vx_v11i64(<11 x i64> %va, i64 %b, <11 x i1> %m, i32 zero define <11 x i64> @vand_vx_v11i64_unmasked(<11 x i64> %va, i64 %b, i32 zeroext %evl) { ; RV32-LABEL: vand_vx_v11i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a3, 32 -; RV32-NEXT: lui a4, 341 -; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a4 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vmerge.vxm v16, v16, a0, v0 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vand_vx_v11i64_unmasked: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll index 0b0d758ad8ded..6adc6ba9621a8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll @@ -1018,51 +1018,27 @@ define <16 x i64> @vmax_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext % declare <32 x i64> @llvm.vp.smax.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vmax_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vmax_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB74_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB74_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmax.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmax.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vmax_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; 
RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB74_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB74_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vmax.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vmax.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vmax_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB74_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB74_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vmax.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmax.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.smax.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll index 98e630a0e59e5..baeb372c017e2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll @@ -1017,51 +1017,27 @@ define <16 x i64> @vmaxu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext declare <32 x i64> @llvm.vp.umax.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vmaxu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vmaxu_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB74_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB74_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmaxu.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmaxu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vmaxu_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB74_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB74_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vmaxu.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vmaxu.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vmaxu_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; 
CHECK-NEXT: bltu a0, a2, .LBB74_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB74_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vmaxu.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmaxu.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.umax.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll index a6e3764b37550..d0c21ce05c025 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll @@ -1018,51 +1018,27 @@ define <16 x i64> @vmin_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext % declare <32 x i64> @llvm.vp.smin.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vmin_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vmin_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB74_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB74_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmin.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmin.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vmin_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB74_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB74_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vmin.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vmin.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vmin_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB74_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB74_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vmin.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmin.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.smin.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll index c59b65edd1ec1..a730ba4729d25 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll @@ -1017,51 +1017,27 @@ define <16 x i64> @vminu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext declare <32 x i64> @llvm.vp.umin.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vminu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vminu_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB74_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB74_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vminu.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vminu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vminu_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB74_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB74_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vminu.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vminu.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vminu_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB74_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB74_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vminu.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vminu.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.umin.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll index df2c83028e5df..c5dd6ac344a37 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll @@ -1359,93 +1359,48 @@ define <16 x i64> @vsadd_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { declare <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vsadd_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB108_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB108_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsadd_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB108_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB108_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsadd_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; RV32-LABEL: vsadd_vi_v32i64_unmasked: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB109_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB109_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v8, v8, v24 -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v16, v16, v24 -; RV32-NEXT: ret -; -; RV64-LABEL: vsadd_vi_v32i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB109_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB109_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v8, v8, -1 -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v16, v16, -1 -; RV64-NEXT: ret +; CHECK-LABEL: vsadd_vi_v32i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1 +; CHECK-NEXT: ret %v = 
call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v } @@ -1453,59 +1408,31 @@ define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; FIXME: We don't match vsadd.vi on RV32. define <32 x i64> @vsadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vsadd_vx_v32i64_evl12: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsadd_vx_v32i64_evl12: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsadd_vx_v32i64_evl12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v } define <32 x i64> @vsadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vsadd_vx_v32i64_evl27: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsadd_vx_v32i64_evl27: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsadd_vx_v32i64_evl27: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll index f50dadf019910..17d9c437590a7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll @@ -1355,93 +1355,48 @@ define <16 x i64> @vsaddu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { declare <32 
x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vsaddu_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB108_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB108_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsaddu_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB108_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB108_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsaddu_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; RV32-LABEL: vsaddu_vi_v32i64_unmasked: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB109_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB109_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v8, v8, v24 -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v16, v16, v24 -; RV32-NEXT: ret -; -; RV64-LABEL: vsaddu_vi_v32i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB109_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB109_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v8, v8, -1 -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v16, 
v16, -1 -; RV64-NEXT: ret +; CHECK-LABEL: vsaddu_vi_v32i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1 +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v } @@ -1449,59 +1404,31 @@ define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; FIXME: We don't match vsaddu.vi on RV32. define <32 x i64> @vsaddu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vsaddu_vx_v32i64_evl12: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsaddu_vx_v32i64_evl12: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsaddu_vx_v32i64_evl12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v } define <32 x i64> @vsaddu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vsaddu_vx_v32i64_evl27: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsaddu_vx_v32i64_evl27: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsaddu_vx_v32i64_evl27: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; 
CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll index b82ca70477ba3..90e1b5ce55752 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll @@ -1399,95 +1399,50 @@ define <16 x i64> @vssub_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { declare <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vssub_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB108_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB108_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vssub_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB108_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB108_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssub_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; RV32-LABEL: vssub_vi_v32i64_unmasked: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB109_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB109_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v8, v8, v24 -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; 
RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v16, v16, v24 -; RV32-NEXT: ret -; -; RV64-LABEL: vssub_vi_v32i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB109_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB109_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v8, v8, a2 -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v16, v16, a2 -; RV64-NEXT: ret +; CHECK-LABEL: vssub_vi_v32i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a2 +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a2 +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v } @@ -1495,61 +1450,33 @@ define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; FIXME: We don't match vssub.vi on RV32. define <32 x i64> @vssub_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vssub_vx_v32i64_evl12: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vssub_vx_v32i64_evl12: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v16, v16, a0, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssub_vx_v32i64_evl12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v } define <32 x i64> @vssub_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vssub_vx_v32i64_evl27: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v16, v16, v24, v0.t -; RV32-NEXT: ret 
-; -; RV64-LABEL: vssub_vx_v32i64_evl27: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v16, v16, a0, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssub_vx_v32i64_evl27: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll index 6d8ed563f02bd..59899ab8b9994 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll @@ -1394,95 +1394,50 @@ define <16 x i64> @vssubu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { declare <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vssubu_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB108_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB108_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vssubu_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB108_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB108_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssubu_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: 
vssubu.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; RV32-LABEL: vssubu_vi_v32i64_unmasked: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB109_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB109_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v8, v8, v24 -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v16, v16, v24 -; RV32-NEXT: ret -; -; RV64-LABEL: vssubu_vi_v32i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB109_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB109_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v8, v8, a2 -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v16, v16, a2 -; RV64-NEXT: ret +; CHECK-LABEL: vssubu_vi_v32i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a2 +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a2 +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v } @@ -1490,61 +1445,33 @@ define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; FIXME: We don't match vssubu.vi on RV32. 
define <32 x i64> @vssubu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vssubu_vx_v32i64_evl12: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vssubu_vx_v32i64_evl12: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v16, v16, a0, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssubu_vx_v32i64_evl12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v } define <32 x i64> @vssubu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vssubu_vx_v32i64_evl27: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vssubu_vx_v32i64_evl27: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v16, v16, a0, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssubu_vx_v32i64_evl27: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index dc27158cfb31f..162f7e34536a7 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -599,13 +599,13 @@ define void @test_srem_vec(ptr %X) nounwind { ; ; RV32MV-LABEL: test_srem_vec: ; RV32MV: # %bb.0: -; RV32MV-NEXT: addi sp, sp, -48 -; RV32MV-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s1, 36(sp) # 4-byte 
Folded Spill -; RV32MV-NEXT: sw s2, 32(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s3, 28(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s4, 24(sp) # 4-byte Folded Spill +; RV32MV-NEXT: addi sp, sp, -64 +; RV32MV-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s2, 48(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s3, 44(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s4, 40(sp) # 4-byte Folded Spill ; RV32MV-NEXT: csrr a1, vlenb ; RV32MV-NEXT: slli a1, a1, 1 ; RV32MV-NEXT: sub sp, sp, a1 @@ -624,29 +624,33 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: srai s3, a0, 31 ; RV32MV-NEXT: srli a1, a1, 1 ; RV32MV-NEXT: slli a1, a1, 31 -; RV32MV-NEXT: lw a0, 0(s0) ; RV32MV-NEXT: srai s4, a1, 31 +; RV32MV-NEXT: lw a0, 0(s0) ; RV32MV-NEXT: slli a1, a3, 31 ; RV32MV-NEXT: srai a1, a1, 31 +; RV32MV-NEXT: li a2, 1 +; RV32MV-NEXT: sw a2, 20(sp) +; RV32MV-NEXT: li a2, -1 +; RV32MV-NEXT: sw a2, 16(sp) ; RV32MV-NEXT: li a2, 6 ; RV32MV-NEXT: li a3, 0 ; RV32MV-NEXT: call __moddi3 ; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32MV-NEXT: vmv.v.x v8, a0 ; RV32MV-NEXT: vslide1down.vx v8, v8, a1 -; RV32MV-NEXT: addi a0, sp, 16 +; RV32MV-NEXT: addi a0, sp, 32 ; RV32MV-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; RV32MV-NEXT: li a2, 7 ; RV32MV-NEXT: mv a0, s2 ; RV32MV-NEXT: mv a1, s4 ; RV32MV-NEXT: li a3, 0 ; RV32MV-NEXT: call __moddi3 -; RV32MV-NEXT: addi a2, sp, 16 +; RV32MV-NEXT: addi a2, sp, 32 ; RV32MV-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload ; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32MV-NEXT: vslide1down.vx v8, v8, a0 ; RV32MV-NEXT: vslide1down.vx v8, v8, a1 -; RV32MV-NEXT: addi a0, sp, 16 +; RV32MV-NEXT: addi a0, sp, 32 ; RV32MV-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; RV32MV-NEXT: li a2, -5 ; RV32MV-NEXT: li a3, -1 @@ -654,18 +658,17 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: mv a1, s3 ; RV32MV-NEXT: call __moddi3 ; RV32MV-NEXT: addi a2, sp, 16 -; RV32MV-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; RV32MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32MV-NEXT: vlse64.v v8, (a2), zero +; RV32MV-NEXT: addi a2, sp, 32 +; RV32MV-NEXT: vl2r.v v10, (a2) # Unknown-size Folded Reload ; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32MV-NEXT: vslide1down.vx v8, v8, a0 -; RV32MV-NEXT: vslide1down.vx v8, v8, a1 -; RV32MV-NEXT: vslidedown.vi v8, v8, 2 -; RV32MV-NEXT: li a0, 511 -; RV32MV-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32MV-NEXT: vmv.v.x v10, a0 -; RV32MV-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32MV-NEXT: vsext.vf4 v12, v10 -; RV32MV-NEXT: vand.vv v8, v8, v12 -; RV32MV-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32MV-NEXT: vslide1down.vx v10, v10, a0 +; RV32MV-NEXT: vslide1down.vx v10, v10, a1 +; RV32MV-NEXT: vslidedown.vi v10, v10, 2 +; RV32MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32MV-NEXT: vand.vv v8, v10, v8 +; RV32MV-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32MV-NEXT: vmv.v.i v10, 1 ; RV32MV-NEXT: vmv.v.i v11, 0 ; RV32MV-NEXT: vsetivli zero, 3, e8, mf2, tu, ma @@ -712,13 +715,13 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: csrr a0, vlenb ; RV32MV-NEXT: slli a0, a0, 1 ; RV32MV-NEXT: add sp, sp, a0 -; RV32MV-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s1, 36(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s2, 32(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s3, 28(sp) # 4-byte Folded 
Reload -; RV32MV-NEXT: lw s4, 24(sp) # 4-byte Folded Reload -; RV32MV-NEXT: addi sp, sp, 48 +; RV32MV-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s2, 48(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s3, 44(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s4, 40(sp) # 4-byte Folded Reload +; RV32MV-NEXT: addi sp, sp, 64 ; RV32MV-NEXT: ret ; ; RV64MV-LABEL: test_srem_vec: From c2fc33204caca8c52b27425255bbc78c9e4d99e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 4 Sep 2024 20:41:31 -0700 Subject: [PATCH 175/425] [flang][cuda] Add c_devptr and bypass output semantic check (#107318) Add a builtin type for c_devptr since it will need some special handling for some function like c_f_pointer. `c_ptr` is defined as a builtin type and was raising a semantic error if you try to use it in a I/O statement. This patch add a check for c_ptr and c_devptr to bypass the semantic check and allow the variables of these types to be used in I/O. --- flang/lib/Semantics/check-io.cpp | 5 +++++ flang/module/__fortran_builtins.f90 | 4 ++++ flang/test/Lower/CUDA/cuda-devptr.cuf | 16 ++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 flang/test/Lower/CUDA/cuda-devptr.cuf diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp index 54e8e09cbf7e4..d7d2f0fa322cb 100644 --- a/flang/lib/Semantics/check-io.cpp +++ b/flang/lib/Semantics/check-io.cpp @@ -1171,6 +1171,11 @@ parser::Message *IoChecker::CheckForBadIoType(const evaluate::DynamicType &type, "Derived type '%s' in I/O may not be polymorphic unless using defined I/O"_err_en_US, derived.name()); } + if (IsBuiltinDerivedType(&derived, "c_ptr") || + IsBuiltinDerivedType(&derived, "c_devptr")) { + // Bypass the check below for c_ptr and c_devptr. + return nullptr; + } if (const Symbol * bad{FindInaccessibleComponent(which, derived, scope)}) { return &context_.Say(where, diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90 index 44b0f17339cd9..a9d3ac897eb58 100644 --- a/flang/module/__fortran_builtins.f90 +++ b/flang/module/__fortran_builtins.f90 @@ -102,6 +102,10 @@ __builtin_threadIdx, __builtin_blockDim, __builtin_blockIdx, & __builtin_gridDim integer, parameter, public :: __builtin_warpsize = 32 + + type, public, bind(c) :: __builtin_c_devptr + type(__builtin_c_ptr) :: cptr + end type intrinsic :: __builtin_fma intrinsic :: __builtin_ieee_is_nan, __builtin_ieee_is_negative, & diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf new file mode 100644 index 0000000000000..4e11e3c0fc8f8 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ -0,0 +1,16 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran specific type + +subroutine sub1() + use iso_c_binding + use __fortran_builtins, only : c_devptr => __builtin_c_devptr + + type(c_ptr) :: ptr + type(c_devptr) :: dptr + print*,ptr + print*,dptr +end + +! CHECK-LABEL: func.func @_QPsub1() +! 
CHECK-COUNT-2: %{{.*}} = fir.call @_FortranAioOutputDerivedType From aad699776496a80af5e062b446fe26a4313ff3e3 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 5 Sep 2024 12:04:33 +0800 Subject: [PATCH 176/425] [RISCV] Fold PseudoVMV_V_V with undef passthru, handling policy (#106943) If a vmv.v.v has an undef passthru then we can just replace it with its input operand, since the tail is completely undefined. This is a reattempt of #106840, but also checks to see if the input was a pseudo where we can relax its tail policy to undef. This also means we don't need to check for undef passthrus in foldVMV_V_V anymore because they will be handled by foldUndefPassthruVMV_V_V. --- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 42 +++++++++++++++++-- .../RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir | 1 - .../CodeGen/RISCV/rvv/vmv.v.v-peephole.ll | 4 +- .../CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 3 +- 4 files changed, 41 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 35c3bc9708d91..026e0a365b8dc 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -66,6 +66,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool convertToWholeRegister(MachineInstr &MI) const; bool convertToUnmasked(MachineInstr &MI) const; bool convertVMergeToVMv(MachineInstr &MI) const; + bool foldUndefPassthruVMV_V_V(MachineInstr &MI); bool foldVMV_V_V(MachineInstr &MI); bool isAllOnesMask(const MachineInstr *MaskDef) const; @@ -472,6 +473,38 @@ bool RISCVVectorPeephole::ensureDominates(const MachineOperand &MO, return true; } +/// If a PseudoVMV_V_V's passthru is undef then we can replace it with its input +bool RISCVVectorPeephole::foldUndefPassthruVMV_V_V(MachineInstr &MI) { + if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VMV_V_V) + return false; + if (MI.getOperand(1).getReg() != RISCV::NoRegister) + return false; + + // If the input was a pseudo with a policy operand, we can give it a tail + // agnostic policy if MI's undef tail subsumes the input's. + MachineInstr *Src = MRI->getVRegDef(MI.getOperand(2).getReg()); + if (Src && !Src->hasUnmodeledSideEffects() && + MRI->hasOneUse(MI.getOperand(2).getReg()) && + RISCVII::hasVLOp(Src->getDesc().TSFlags) && + RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags) && + getSEWLMULRatio(MI) == getSEWLMULRatio(*Src)) { + const MachineOperand &MIVL = MI.getOperand(3); + const MachineOperand &SrcVL = + Src->getOperand(RISCVII::getVLOpNum(Src->getDesc())); + + MachineOperand &SrcPolicy = + Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())); + + if (isVLKnownLE(MIVL, SrcVL)) + SrcPolicy.setImm(SrcPolicy.getImm() | RISCVII::TAIL_AGNOSTIC); + } + + MRI->replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); + MI.eraseFromParent(); + V0Defs.erase(&MI); + return true; +} + /// If a PseudoVMV_V_V is the only user of its input, fold its passthru and VL /// into it. /// @@ -531,9 +564,8 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { // If MI was tail agnostic and the VL didn't increase, preserve it. 
int64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; - bool TailAgnostic = (MI.getOperand(5).getImm() & RISCVII::TAIL_AGNOSTIC) || - Passthru.getReg() == RISCV::NoRegister; - if (TailAgnostic && isVLKnownLE(MI.getOperand(3), SrcVL)) + if ((MI.getOperand(5).getImm() & RISCVII::TAIL_AGNOSTIC) && + isVLKnownLE(MI.getOperand(3), SrcVL)) Policy |= RISCVII::TAIL_AGNOSTIC; Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); @@ -584,6 +616,10 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) { Changed |= convertToUnmasked(MI); Changed |= convertToWholeRegister(MI); Changed |= convertVMergeToVMv(MI); + if (foldUndefPassthruVMV_V_V(MI)) { + Changed |= true; + continue; // MI is erased + } Changed |= foldVMV_V_V(MI); } } diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir index 1419eede6ca9d..19a918148e6eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir @@ -15,7 +15,6 @@ body: | ; CHECK-NEXT: %avl:gprnox0 = COPY $x1 ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 5 /* e32 */ ; CHECK-NEXT: $v0 = COPY %mask - ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 $noreg, %true, %avl, 5 /* e32 */, 0 /* tu, mu */ %false:vr = COPY $v8 %true:vr = COPY $v9 %avl:gprnox0 = COPY $x1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll index c1602c912da63..7f248a39b54fa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll @@ -199,10 +199,8 @@ define @unfoldable_mismatched_sew( %passthr define @undef_passthru( %passthru, %x, %y, iXLen %avl) { ; CHECK-LABEL: undef_passthru: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v9, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 ; CHECK-NEXT: ret %a = call @llvm.riscv.vadd.nxv1i64.nxv1i64( %passthru, %x, %y, iXLen %avl) %b = call @llvm.riscv.vmv.v.v.nxv1i64( undef, %a, iXLen %avl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir index 7e2ac0e26f251..6858231bf0e6c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir @@ -69,8 +69,7 @@ body: | ; CHECK: liveins: $v8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %passthru:vr = COPY $v8 - ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ - ; CHECK-NEXT: %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 1 /* ta, mu */ + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 1 /* ta, mu */ %passthru:vr = COPY $v8 %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 1 /* ta, mu */ From ad89e617c703239518187912540b8ea811dc2eda Mon Sep 17 00:00:00 2001 From: Matt Hofmann Date: Thu, 5 Sep 2024 00:12:03 -0400 Subject: [PATCH 177/425] [MLIR][Python] Fix detached operation coming from `IfOp` constructor (#107286) Without this fix, `scf.if` operations would be created without a parent. Since `scf.if` operations often have no results, this caused silent bugs where the generated code was straight-up missing the operation. 
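
As a rough usage sketch (standalone, assuming the in-tree MLIR Python bindings; the function and value names here are made up for the example and are not part of this change), the pattern that now works is passing an explicit insertion point to the IfOp constructor and having it honored:

from mlir.ir import Context, Location, Module, InsertionPoint, IntegerType
from mlir.dialects import func, scf

with Context(), Location.unknown():
    module = Module.create()
    with InsertionPoint(module.body):
        i1 = IntegerType.get_signless(1)

        @func.FuncOp.from_py_func(i1)
        def guarded(cond):
            outer = scf.IfOp(cond)
            with InsertionPoint(outer.then_block):
                # The explicit ip= is now forwarded to the underlying builder;
                # previously it was dropped, so an scf.if built this way could
                # be left detached and silently missing from the printed IR.
                inner = scf.IfOp(cond, ip=InsertionPoint(outer.then_block))
                with InsertionPoint(inner.then_block):
                    scf.YieldOp([])
                scf.YieldOp([])
            return

    print(module)
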
--- mlir/python/mlir/dialects/scf.py | 2 +- mlir/test/python/dialects/scf.py | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/mlir/python/mlir/dialects/scf.py b/mlir/python/mlir/dialects/scf.py index 7025f6e0f1a16..2d0047b76c702 100644 --- a/mlir/python/mlir/dialects/scf.py +++ b/mlir/python/mlir/dialects/scf.py @@ -87,7 +87,7 @@ def __init__(self, cond, results_=None, *, hasElse=False, loc=None, ip=None): operands.append(cond) results = [] results.extend(results_) - super().__init__(results, cond) + super().__init__(results, cond, loc=loc, ip=ip) self.regions[0].blocks.append(*[]) if hasElse: self.regions[1].blocks.append(*[]) diff --git a/mlir/test/python/dialects/scf.py b/mlir/test/python/dialects/scf.py index 95a6de86b670d..de61f4613868f 100644 --- a/mlir/test/python/dialects/scf.py +++ b/mlir/test/python/dialects/scf.py @@ -278,6 +278,32 @@ def simple_if(cond): # CHECK: return +@constructAndPrintInModule +def testNestedIf(): + bool = IntegerType.get_signless(1) + i32 = IntegerType.get_signless(32) + + @func.FuncOp.from_py_func(bool, bool) + def nested_if(b, c): + if_op = scf.IfOp(b) + with InsertionPoint(if_op.then_block) as ip: + if_op = scf.IfOp(c, ip=ip) + with InsertionPoint(if_op.then_block): + one = arith.ConstantOp(i32, 1) + add = arith.AddIOp(one, one) + scf.YieldOp([]) + scf.YieldOp([]) + return + + +# CHECK: func @nested_if(%[[ARG0:.*]]: i1, %[[ARG1:.*]]: i1) +# CHECK: scf.if %[[ARG0:.*]] +# CHECK: scf.if %[[ARG1:.*]] +# CHECK: %[[ONE:.*]] = arith.constant 1 +# CHECK: %[[ADD:.*]] = arith.addi %[[ONE]], %[[ONE]] +# CHECK: return + + @constructAndPrintInModule def testIfWithElse(): bool = IntegerType.get_signless(1) From 41c11ea2af743051013dfcc0fced5a450e2dc9b8 Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Wed, 4 Sep 2024 21:15:48 -0700 Subject: [PATCH 178/425] [HLSL] Remove variables that are used only in assert (#107299) Changes the assert to test the same condition without using the variables. This change is done in response to a comment [here](https://github.com/llvm/llvm-project/pull/106657#issuecomment-2327493439). 
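
For illustration only, a minimal plain-C++ sketch of the pattern being cleaned up (these types and functions are invented for the example and are not the clang code): locals that exist solely to feed an assert force (void) casts to keep NDEBUG builds warning-free, whereas asserting the condition directly avoids both.

#include <cassert>

struct Decl { virtual ~Decl() = default; };
struct VarDecl : Decl {};
struct BufferDecl : Decl {};

// Before: the locals are only read inside assert(), so release (NDEBUG)
// builds see unused variables unless each one is (void)-cast away.
void checkExactlyOneKindBefore(Decl *D) {
  auto *Var = dynamic_cast<VarDecl *>(D);
  auto *Buf = dynamic_cast<BufferDecl *>(D);
  assert(((Var && !Buf) || (!Var && Buf)) && "expected exactly one kind");
  (void)Var;
  (void)Buf;
}

// After: the assert evaluates the same condition in place, so there are no
// assert-only locals and no (void) casts.
void checkExactlyOneKindAfter(Decl *D) {
  assert(((dynamic_cast<VarDecl *>(D) && !dynamic_cast<BufferDecl *>(D)) ||
          (!dynamic_cast<VarDecl *>(D) && dynamic_cast<BufferDecl *>(D))) &&
         "expected exactly one kind");
  (void)D; // D is otherwise unused once NDEBUG strips the assert
}

int main() {
  VarDecl V;
  checkExactlyOneKindBefore(&V);
  checkExactlyOneKindAfter(&V);
}
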
--- clang/lib/Sema/SemaHLSL.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 65aeda4b7b613..d5ccd3815eb66 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -837,17 +837,10 @@ static void ValidateMultipleRegisterAnnotations(Sema &S, Decl *TheDecl, static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, Decl *TheDecl, RegisterType regType) { - // Samplers, UAVs, and SRVs are VarDecl types - VarDecl *TheVarDecl = dyn_cast(TheDecl); - // Cbuffers and Tbuffers are HLSLBufferDecl types - HLSLBufferDecl *CBufferOrTBuffer = dyn_cast(TheDecl); - // exactly one of these two types should be set - assert(((TheVarDecl && !CBufferOrTBuffer) || - (!TheVarDecl && CBufferOrTBuffer)) && - "either TheVarDecl or CBufferOrTBuffer should be set"); - (void)TheVarDecl; - (void)CBufferOrTBuffer; + assert(((isa(TheDecl) && !isa(TheDecl)) || + (!isa(TheDecl) && isa(TheDecl))) && + "expecting VarDecl or HLSLBufferDecl"); RegisterBindingFlags Flags = HLSLFillRegisterBindingFlags(S, TheDecl); assert((int)Flags.Other + (int)Flags.Resource + (int)Flags.Basic + From abbcfff706b33a8965afa9f2c520f60ad46f3b9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 4 Sep 2024 21:16:00 -0700 Subject: [PATCH 179/425] Revert "[flang][cuda] Add c_devptr and bypass output semantic check" (#107349) Reverts llvm/llvm-project#107318 It breaks a test https://lab.llvm.org/buildbot/#/builders/143/builds/1933 --- flang/lib/Semantics/check-io.cpp | 5 ----- flang/module/__fortran_builtins.f90 | 4 ---- flang/test/Lower/CUDA/cuda-devptr.cuf | 16 ---------------- 3 files changed, 25 deletions(-) delete mode 100644 flang/test/Lower/CUDA/cuda-devptr.cuf diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp index d7d2f0fa322cb..54e8e09cbf7e4 100644 --- a/flang/lib/Semantics/check-io.cpp +++ b/flang/lib/Semantics/check-io.cpp @@ -1171,11 +1171,6 @@ parser::Message *IoChecker::CheckForBadIoType(const evaluate::DynamicType &type, "Derived type '%s' in I/O may not be polymorphic unless using defined I/O"_err_en_US, derived.name()); } - if (IsBuiltinDerivedType(&derived, "c_ptr") || - IsBuiltinDerivedType(&derived, "c_devptr")) { - // Bypass the check below for c_ptr and c_devptr. - return nullptr; - } if (const Symbol * bad{FindInaccessibleComponent(which, derived, scope)}) { return &context_.Say(where, diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90 index a9d3ac897eb58..44b0f17339cd9 100644 --- a/flang/module/__fortran_builtins.f90 +++ b/flang/module/__fortran_builtins.f90 @@ -102,10 +102,6 @@ __builtin_threadIdx, __builtin_blockDim, __builtin_blockIdx, & __builtin_gridDim integer, parameter, public :: __builtin_warpsize = 32 - - type, public, bind(c) :: __builtin_c_devptr - type(__builtin_c_ptr) :: cptr - end type intrinsic :: __builtin_fma intrinsic :: __builtin_ieee_is_nan, __builtin_ieee_is_negative, & diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf deleted file mode 100644 index 4e11e3c0fc8f8..0000000000000 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ /dev/null @@ -1,16 +0,0 @@ -! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s - -! 
Test CUDA Fortran specific type - -subroutine sub1() - use iso_c_binding - use __fortran_builtins, only : c_devptr => __builtin_c_devptr - - type(c_ptr) :: ptr - type(c_devptr) :: dptr - print*,ptr - print*,dptr -end - -! CHECK-LABEL: func.func @_QPsub1() -! CHECK-COUNT-2: %{{.*}} = fir.call @_FortranAioOutputDerivedType From 1465e23985904d55a014f3377c287ded45c0fa0c Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Thu, 5 Sep 2024 12:46:20 +0800 Subject: [PATCH 180/425] [RISCV][llvm] Handle `ptr` element type in `lowerDeinterleaveIntrinsicToLoad` and `lowerInterleaveIntrinsicToStore` (#107079) Resolve https://github.com/llvm/llvm-project/issues/106970 currently it returns 0 fixed size for `ptr` element type. The `ptr` element size should depend on `XLen` which is 64 in riscv64 and 32 in riscv32 respectively. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 12 +++++----- .../RISCV/rvv/vector-deinterleave-load.ll | 22 +++++++++++++++++-- .../RISCV/rvv/vector-interleave-store.ll | 21 ++++++++++++++++-- 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a9061a05c7c67..d400b2ea1ca2c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -21980,10 +21980,10 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( VectorType *VTy = cast(DI->getOperand(0)->getType()); VectorType *ResVTy = cast(DI->getType()->getContainedType(0)); + const DataLayout &DL = LI->getDataLayout(); if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), - LI->getDataLayout())) + LI->getPointerAddressSpace(), DL)) return false; Function *VlsegNFunc; @@ -22005,7 +22005,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, Intrinsic::riscv_vlseg8}; - unsigned SEW = ResVTy->getElementType()->getScalarSizeInBits(); + unsigned SEW = DL.getTypeSizeInBits(ResVTy->getElementType()); unsigned NumElts = ResVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( LI->getContext(), "riscv.vector.tuple", @@ -22051,10 +22051,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( VectorType *VTy = cast(II->getType()); VectorType *InVTy = cast(II->getOperand(0)->getType()); + const DataLayout &DL = SI->getDataLayout(); if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), - SI->getDataLayout())) + SI->getPointerAddressSpace(), DL)) return false; Function *VssegNFunc; @@ -22075,7 +22075,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, Intrinsic::riscv_vsseg8}; - unsigned SEW = InVTy->getElementType()->getScalarSizeInBits(); + unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( SI->getContext(), "riscv.vector.tuple", diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index e2f956ca03ff8..54373d94f8f5f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+m | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+m 
| FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+m | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+m | FileCheck --check-prefixes=CHECK,RV64 %s ; Integers @@ -263,9 +263,27 @@ define {, } @vector_deinterleave_load_ ret {, } %retval } +define {, } @vector_deinterleave_load_nxv2p0_nxv4p0(ptr %p) { +; RV32-LABEL: vector_deinterleave_load_nxv2p0_nxv4p0: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: vector_deinterleave_load_nxv2p0_nxv4p0: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64-NEXT: vlseg2e64.v v8, (a0) +; RV64-NEXT: ret + %vec = load , ptr %p + %retval = call {, } @llvm.vector.deinterleave2.nxv4p0( %vec) + ret {, } %retval +} + declare {,} @llvm.vector.deinterleave2.nxv4f16() declare {, } @llvm.vector.deinterleave2.nxv8f16() declare {, } @llvm.vector.deinterleave2.nxv4f32() declare {, } @llvm.vector.deinterleave2.nxv16f16() declare {, } @llvm.vector.deinterleave2.nxv8f32() declare {, } @llvm.vector.deinterleave2.nxv4f64() +declare {, } @llvm.vector.deinterleave2.nxv4p0() diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index 5ebf63f0a4411..a06aa2d02b11b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck --check-prefixes=CHECK,RV64 %s ; Integers @@ -218,6 +218,22 @@ define void @vector_interleave_store_nxv4f64_nxv2f64( %a, < ret void } +define void @vector_interleave_store_nxv4p0_nxv2p0( %a, %b, ptr %p) { +; RV32-LABEL: vector_interleave_store_nxv4p0_nxv2p0: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vsseg2e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_store_nxv4p0_nxv2p0: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64-NEXT: vsseg2e64.v v8, (a0) +; RV64-NEXT: ret + %res = call @llvm.vector.interleave2.nxv4p0( %a, %b) + store %res, ptr %p + ret void +} declare @llvm.vector.interleave2.nxv4f16(, ) declare @llvm.vector.interleave2.nxv8f16(, ) @@ -225,3 +241,4 @@ declare @llvm.vector.interleave2.nxv4f32( @llvm.vector.interleave2.nxv16f16(, ) declare @llvm.vector.interleave2.nxv8f32(, ) declare @llvm.vector.interleave2.nxv4f64(, ) +declare @llvm.vector.interleave2.nxv4p0(, ) From da8fb7f4dddf48b2395f80dc09efffe38efa3d2f Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Thu, 5 Sep 2024 12:47:00 +0800 Subject: [PATCH 181/425] [clang][RISCV] Fix typo of vector crypto in SemaRISCV.cpp. 
NFC (#106485) --- clang/lib/Sema/SemaRISCV.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp index abf8e4ac2f3e8..56d6f12fbc6e4 100644 --- a/clang/lib/Sema/SemaRISCV.cpp +++ b/clang/lib/Sema/SemaRISCV.cpp @@ -733,7 +733,7 @@ bool SemaRISCV::CheckBuiltinFunctionCall(const TargetInfo &TI, if (ElemSize == 64 && !TI.hasFeature("zvknhb")) return Diag(TheCall->getBeginLoc(), diag::err_riscv_builtin_requires_extension) - << /* IsExtension */ true << TheCall->getSourceRange() << "zvknb"; + << /* IsExtension */ true << TheCall->getSourceRange() << "zvknhb"; return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type, ElemSize * 4) || From 845d8d909c37c61298d49c0e91949c669ca15215 Mon Sep 17 00:00:00 2001 From: Elvis Wang <110374989+ElvisWang123@users.noreply.github.com> Date: Thu, 5 Sep 2024 13:05:01 +0800 Subject: [PATCH 182/425] [RISCV][TTI] Add cost of typebased cast VPIntrinsics with functionalOPC. (#97797) This patch make the instruction cost of type-based cast VP intrinsics will be same as their non-VP counterpart. This is the following patch of [#93435](https://github.com/llvm/llvm-project/pull/93435) --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 22 +- llvm/test/Analysis/CostModel/RISCV/cast.ll | 3409 ++++++++++++++++- 2 files changed, 3427 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index e655200e7a895..e809e15eacf69 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1011,8 +1011,26 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, case Intrinsic::vp_frem: { std::optional FOp = VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID()); - if (FOp) - return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind); + assert(FOp.has_value()); + return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind); + break; + } + // vp int cast ops. + case Intrinsic::vp_trunc: + case Intrinsic::vp_zext: + case Intrinsic::vp_sext: + // vp float cast ops. 
+ case Intrinsic::vp_fptoui: + case Intrinsic::vp_fptosi: + case Intrinsic::vp_uitofp: + case Intrinsic::vp_sitofp: + case Intrinsic::vp_fptrunc: + case Intrinsic::vp_fpext: { + std::optional FOp = + VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID()); + assert(FOp.has_value() && !ICA.getArgTypes().empty()); + return getCastInstrCost(*FOp, RetTy, ICA.getArgTypes()[0], + TTI::CastContextHint::None, CostKind); break; } } diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index c1759b8b03d0f..5650d2cf90eac 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -passes="print" -cost-kind=throughput --type-based-intrinsic-cost=true 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print" -cost-kind=throughput --type-based-intrinsic-cost=true 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64 define void @sext() { ; RV32-LABEL: 'sext' @@ -14,6 +14,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> 
@llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> @@ -24,6 +34,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> @@ -34,6 +54,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = 
call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> @@ -44,6 +74,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> @@ -54,6 +94,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> @@ -64,6 +114,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: 
%v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> @@ -74,6 +134,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: 
%vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> @@ -84,6 +154,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) 
+; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext undef to @@ -94,6 +174,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.sext.nxv1i8.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext undef to @@ -104,6 +194,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.sext.nxv2i8.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext undef to @@ -114,6 +214,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.sext.nxv4i8.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext undef to ; RV32-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext undef to @@ -124,6 +234,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.sext.nxv8i8.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext undef to @@ -134,6 +254,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: 
Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i8.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext undef to @@ -144,6 +274,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i8.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = sext undef to @@ -154,6 +294,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: 
%nxv64i1_nxv64i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i8.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext undef to @@ -164,6 +314,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_nxv128i1_nxv128i16 = call 
@llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'sext' @@ -177,6 +337,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> @@ -187,6 +357,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> @@ -197,6 +377,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> @@ -207,6 +397,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> @@ -217,6 +417,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> ; 
RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> @@ -227,6 +437,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> @@ -237,6 +457,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x 
i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> @@ -247,6 +477,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext undef to @@ -257,6 +497,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext 
undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.sext.nxv1i8.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext undef to @@ -267,6 +517,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.sext.nxv2i8.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call 
@llvm.vp.sext.nxv2i32.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext undef to @@ -277,6 +537,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.sext.nxv4i8.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext undef to @@ -287,6 +557,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.sext.nxv8i8.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext undef to @@ -297,6 +577,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i8.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext undef to ; RV64-NEXT: Cost Model: Found an 
estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext undef to @@ -307,6 +597,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i8.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext undef to @@ -317,6 +617,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for 
instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i8.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext undef to @@ -327,6 +637,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = sext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16> @@ -340,6 +660,17 @@ define void @sext() { %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> + %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i8.v2i16(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i8.v2i32(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i8.v2i64(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i32 = call 
<2 x i32> @llvm.vp.sext.v2i16.v2i32(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i16.v2i64(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i32.v2i64(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i1.v2i8(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i1.v2i16(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i1.v2i32(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i1.v2i64(<2 x i1> undef, <2 x i1> undef, i32 undef) + %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> @@ -351,6 +682,17 @@ define void @sext() { %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> + %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i8.v4i16(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i8.v4i32(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i8.v4i64(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i16.v4i32(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i16.v4i64(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i32.v4i64(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i1.v4i8(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i1.v4i16(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i1.v4i32(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i1.v4i64(<4 x i1> undef, <4 x i1> undef, i32 undef) + %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> @@ -362,6 +704,17 @@ define void @sext() { %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> + %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i8.v8i16(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i8.v8i32(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i8.v8i64(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i16.v8i32(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i16.v8i64(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i32.v8i64(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i1.v8i8(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i1.v8i16(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i1.v8i32(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i1.v8i64(<8 x i1> undef, <8 x i1> undef, i32 undef) + %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> @@ -373,6 +726,17 @@ define void @sext() { %v16i1_v16i32 = sext <16 x i1> undef to <16 x 
i32> %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> + %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i8.v16i16(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i8.v16i32(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i8.v16i64(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i16.v16i32(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i16.v16i64(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i32.v16i64(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i1.v16i8(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i1.v16i16(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i1.v16i32(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i1.v16i64(<16 x i1> undef, <16 x i1> undef, i32 undef) + %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> @@ -384,6 +748,17 @@ define void @sext() { %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> + %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i8.v32i16(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i8.v32i32(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i8.v32i64(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i16.v32i32(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i16.v32i64(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i32.v32i64(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i1.v32i8(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i1.v32i16(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i1.v32i32(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i1.v32i64(<32 x i1> undef, <32 x i1> undef, i32 undef) + %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> @@ -395,6 +770,17 @@ define void @sext() { %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> + %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i8.v64i16(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i8.v64i32(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i8.v64i64(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i16.v64i32(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i16.v64i64(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i32.v64i64(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i8 = call <64 x i8> 
@llvm.vp.sext.v64i1.v64i8(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i1.v64i16(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i1.v64i32(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i1.v64i64(<64 x i1> undef, <64 x i1> undef, i32 undef) + %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> @@ -406,6 +792,17 @@ define void @sext() { %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> + %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i8.v128i16(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i8.v128i32(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i8.v128i64(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i16.v128i32(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i16.v128i64(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i32.v128i64(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i1.v128i8(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i1.v128i16(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i1.v128i32(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i1.v128i64(<128 x i1> undef, <128 x i1> undef, i32 undef) + %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> @@ -417,6 +814,17 @@ define void @sext() { %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> + %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i8.v256i16(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i8.v256i32(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i8.v256i64(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i16.v256i32(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i16.v256i64(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i32.v256i64(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i1.v256i8(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i1.v256i16(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i1.v256i32(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i1.v256i64(<256 x i1> undef, <256 x i1> undef, i32 undef) + %nxv1i8_nxv1i16 = sext undef to %nxv1i8_nxv1i32 = sext undef to %nxv1i8_nxv1i64 = sext undef to @@ -428,6 +836,17 @@ define void @sext() { %nxv1i1_nxv1i32 = sext undef to 
%nxv1i1_nxv1i64 = sext undef to + %vp_nxv1i8_nxv1i16 = call @llvm.vp.sext.nxv1i8.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i8_nxv1i32 = call @llvm.vp.sext.nxv1i8.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i8_nxv1i64 = call @llvm.vp.sext.nxv1i8.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i16_nxv1i32 = call @llvm.vp.sext.nxv1i16.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i16_nxv1i64 = call @llvm.vp.sext.nxv1i16.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i32_nxv1i64 = call @llvm.vp.sext.nxv1i32.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i8 = call @llvm.vp.sext.nxv1i1.nxv1i8( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i16 = call @llvm.vp.sext.nxv1i1.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i32 = call @llvm.vp.sext.nxv1i1.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i64 = call @llvm.vp.sext.nxv1i1.nxv1i64( undef, undef, i32 undef) + %nxv2i8_nxv2i16 = sext undef to %nxv2i8_nxv2i32 = sext undef to %nxv2i8_nxv2i64 = sext undef to @@ -439,6 +858,17 @@ define void @sext() { %nxv2i1_nxv2i32 = sext undef to %nxv2i1_nxv2i64 = sext undef to + %vp_nxv2i8_nxv2i16 = call @llvm.vp.sext.nxv2i8.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i8_nxv2i32 = call @llvm.vp.sext.nxv2i8.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i8_nxv2i64 = call @llvm.vp.sext.nxv2i8.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i16_nxv2i32 = call @llvm.vp.sext.nxv2i16.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i16_nxv2i64 = call @llvm.vp.sext.nxv2i16.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i32_nxv2i64 = call @llvm.vp.sext.nxv2i32.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i8 = call @llvm.vp.sext.nxv2i1.nxv2i8( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i16 = call @llvm.vp.sext.nxv2i1.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i32 = call @llvm.vp.sext.nxv2i1.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i64 = call @llvm.vp.sext.nxv2i1.nxv2i64( undef, undef, i32 undef) + %nxv4i8_nxv4i16 = sext undef to %nxv4i8_nxv4i32 = sext undef to %nxv4i8_nxv4i64 = sext undef to @@ -450,6 +880,17 @@ define void @sext() { %nxv4i1_nxv4i32 = sext undef to %nxv4i1_nxv4i64 = sext undef to + %vp_nxv4i8_nxv4i16 = call @llvm.vp.sext.nxv4i8.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i32 = call @llvm.vp.sext.nxv4i8.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i64 = call @llvm.vp.sext.nxv4i8.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i32 = call @llvm.vp.sext.nxv4i16.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i64 = call @llvm.vp.sext.nxv4i16.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i64 = call @llvm.vp.sext.nxv4i32.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i8 = call @llvm.vp.sext.nxv4i1.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i16 = call @llvm.vp.sext.nxv4i1.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i32 = call @llvm.vp.sext.nxv4i1.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i64 = call @llvm.vp.sext.nxv4i1.nxv4i64( undef, undef, i32 undef) + %nxv8i8_nxv8i16 = sext undef to %nxv8i8_nxv8i32 = sext undef to %nxv8i8_nxv8i64 = sext undef to @@ -461,6 +902,17 @@ define void @sext() { %nxv8i1_nxv8i32 = sext undef to %nxv8i1_nxv8i64 = sext undef to + %vp_nxv8i8_nxv8i16 = call @llvm.vp.sext.nxv8i8.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i32 = call @llvm.vp.sext.nxv8i8.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i64 = call @llvm.vp.sext.nxv8i8.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i32 = call @llvm.vp.sext.nxv8i16.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i64 = call @llvm.vp.sext.nxv8i16.nxv8i64( undef, 
undef, i32 undef) + %vp_nxv8i32_nxv8i64 = call @llvm.vp.sext.nxv8i32.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i8 = call @llvm.vp.sext.nxv8i1.nxv8i8( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i16 = call @llvm.vp.sext.nxv8i1.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i32 = call @llvm.vp.sext.nxv8i1.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i64 = call @llvm.vp.sext.nxv8i1.nxv8i64( undef, undef, i32 undef) + %nxv16i8_nxv16i16 = sext undef to %nxv16i8_nxv16i32 = sext undef to %nxv16i8_nxv16i64 = sext undef to @@ -472,6 +924,17 @@ define void @sext() { %nxv16i1_nxv16i32 = sext undef to %nxv16i1_nxv16i64 = sext undef to + %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i8.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i8.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i8.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i32 = call @llvm.vp.sext.nxv16i16.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i64 = call @llvm.vp.sext.nxv16i16.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i64 = call @llvm.vp.sext.nxv16i32.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i1.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i1.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i1.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i1.nxv16i64( undef, undef, i32 undef) + %nxv32i8_nxv32i16 = sext undef to %nxv32i8_nxv32i32 = sext undef to %nxv32i8_nxv32i64 = sext undef to @@ -483,6 +946,17 @@ define void @sext() { %nxv32i1_nxv32i32 = sext undef to %nxv32i1_nxv32i64 = sext undef to + %vp_nxv32i8_nxv32i16 = call @llvm.vp.sext.nxv32i8.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i8.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i8.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i32 = call @llvm.vp.sext.nxv32i16.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i64 = call @llvm.vp.sext.nxv32i16.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i32.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i1.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i1.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i1.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i1.nxv32i64( undef, undef, i32 undef) + %nxv64i8_nxv64i16 = sext undef to %nxv64i8_nxv64i32 = sext undef to %nxv64i8_nxv64i64 = sext undef to @@ -494,6 +968,17 @@ define void @sext() { %nxv64i1_nxv64i32 = sext undef to %nxv64i1_nxv64i64 = sext undef to + %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i8.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i8.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i64 = call @llvm.vp.sext.nxv64i8.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i32 = call @llvm.vp.sext.nxv64i16.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i16.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i32.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i1.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i1.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i32 = 
call @llvm.vp.sext.nxv64i1.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i1.nxv64i64( undef, undef, i32 undef) + %nxv128i8_nxv128i16 = sext undef to %nxv128i8_nxv128i32 = sext undef to %nxv128i8_nxv128i128 = sext undef to @@ -505,6 +990,17 @@ define void @sext() { %nxv128i1_nxv128i32 = sext undef to %nxv128i1_nxv128i128 = sext undef to + %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i8.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i32 = call @llvm.vp.sext.nxv128i8.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i128 = call @llvm.vp.sext.nxv128i8.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i16.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i16.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i32.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i1.nxv128i8( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i1.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i1.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i128 = call @llvm.vp.sext.nxv128i1.nxv128i128( undef, undef, i32 undef) + ret void } @@ -520,6 +1016,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> ; 
RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> @@ -530,6 +1036,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> @@ -540,6 +1056,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x 
i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> @@ -550,6 +1076,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> @@ -560,6 +1096,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> @@ -570,6 +1116,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> ; RV32-NEXT: 
Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> @@ -580,6 +1136,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: 
%vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> @@ -590,6 +1156,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) 
+; RV32-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext undef to @@ -600,6 +1176,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.zext.nxv1i8.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext undef to @@ -610,6 +1196,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.zext.nxv2i8.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext undef to @@ -620,6 +1216,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.zext.nxv4i8.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext undef to @@ -630,6 +1236,16 @@ 
define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.zext.nxv8i8.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext undef to @@ -640,6 +1256,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i8.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext undef to @@ -650,6 +1276,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i8.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = zext undef to @@ -660,6 +1296,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: 
%vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i8.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext undef to @@ -670,6 +1316,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 
undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'zext' @@ -683,6 +1339,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> @@ -693,6 +1359,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> @@ -703,6 +1379,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> 
@llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> @@ -713,6 +1399,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> @@ -723,6 +1419,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: 
%v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> @@ -733,6 +1439,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: 
Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> @@ -743,6 +1459,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> 
undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> @@ -753,6 +1479,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext undef to @@ -763,6 +1499,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.zext.nxv1i8.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext undef to @@ -773,6 +1519,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.zext.nxv2i8.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i1( undef, undef, i32 undef) ; 
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext undef to @@ -783,6 +1539,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.zext.nxv4i8.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext undef to @@ -793,6 +1559,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%vp_nxv8i16_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.zext.nxv8i8.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext undef to @@ -803,6 +1579,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i8.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext undef to @@ -813,6 +1599,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: 
%nxv32i1_nxv32i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i8.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext undef to @@ -823,6 +1619,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: 
%vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i8.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext undef to @@ -833,6 +1639,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = zext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16> @@ -846,6 +1662,17 @@ define void @zext() { %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32> %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> + %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i8.v2i16(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i8.v2i32(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i8.v2i64(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i16.v2i32(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i16.v2i64(<2 x i16> undef, <2 x i1> 
undef, i32 undef) + %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i32.v2i64(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i1.v2i8(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i1.v2i16(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i1.v2i32(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i1.v2i64(<2 x i1> undef, <2 x i1> undef, i32 undef) + %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> @@ -857,6 +1684,17 @@ define void @zext() { %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> + %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i8.v4i16(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i8.v4i32(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i8.v4i64(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i16.v4i32(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i16.v4i64(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i32.v4i64(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i1.v4i8(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i1.v4i16(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i1.v4i32(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i1.v4i64(<4 x i1> undef, <4 x i1> undef, i32 undef) + %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> @@ -868,6 +1706,17 @@ define void @zext() { %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> + %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i8.v8i16(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i8.v8i32(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i8.v8i64(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i16.v8i32(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i16.v8i64(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i32.v8i64(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i1.v8i8(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i1.v8i16(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i1.v8i32(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i1.v8i64(<8 x i1> undef, <8 x i1> undef, i32 undef) + %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> @@ -879,6 +1728,17 @@ define void @zext() { %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> + %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i8.v16i16(<16 x i8> undef, <16 x i1> undef, i32 undef) + 
%vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i8.v16i32(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i8.v16i64(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i16.v16i32(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i16.v16i64(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i32.v16i64(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i1.v16i8(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i1.v16i16(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i1.v16i32(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i1.v16i64(<16 x i1> undef, <16 x i1> undef, i32 undef) + %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> @@ -890,6 +1750,17 @@ define void @zext() { %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> + %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i8.v32i16(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i8.v32i32(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i8.v32i64(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i16.v32i32(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i16.v32i64(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i32.v32i64(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i1.v32i8(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i1.v32i16(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i1.v32i32(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i1.v32i64(<32 x i1> undef, <32 x i1> undef, i32 undef) + %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> @@ -901,6 +1772,17 @@ define void @zext() { %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> + %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i8.v64i16(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i8.v64i32(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i8.v64i64(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i16.v64i32(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i16.v64i64(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i32.v64i64(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i1.v64i8(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i1.v64i16(<64 x i1> undef, <64 x i1> undef, i32 
undef) + %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i1.v64i32(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i1.v64i64(<64 x i1> undef, <64 x i1> undef, i32 undef) + %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> @@ -912,6 +1794,17 @@ define void @zext() { %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> + %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i8.v128i16(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i8.v128i32(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i8.v128i64(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i16.v128i32(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i16.v128i64(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i32.v128i64(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i1.v128i8(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i1.v128i16(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i1.v128i32(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i1.v128i64(<128 x i1> undef, <128 x i1> undef, i32 undef) + %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> @@ -923,6 +1816,17 @@ define void @zext() { %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> + %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i8.v256i16(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i8.v256i32(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i8.v256i64(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i16.v256i32(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i16.v256i64(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i32.v256i64(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i1.v256i8(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i1.v256i16(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i1.v256i32(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i1.v256i64(<256 x i1> undef, <256 x i1> undef, i32 undef) + %nxv1i8_nxv1i16 = zext undef to %nxv1i8_nxv1i32 = zext undef to %nxv1i8_nxv1i64 = zext undef to @@ -934,6 +1838,17 @@ define void @zext() { %nxv1i1_nxv1i32 = zext undef to %nxv1i1_nxv1i64 = zext undef to + %vp_nxv1i8_nxv1i16 = call @llvm.vp.zext.nxv1i8.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i8_nxv1i32 = call @llvm.vp.zext.nxv1i8.nxv1i32( undef, 
undef, i32 undef) + %vp_nxv1i8_nxv1i64 = call @llvm.vp.zext.nxv1i8.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i16_nxv1i32 = call @llvm.vp.zext.nxv1i16.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i16_nxv1i64 = call @llvm.vp.zext.nxv1i16.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i32_nxv1i64 = call @llvm.vp.zext.nxv1i32.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i8 = call @llvm.vp.zext.nxv1i1.nxv1i8( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i16 = call @llvm.vp.zext.nxv1i1.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i32 = call @llvm.vp.zext.nxv1i1.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i64 = call @llvm.vp.zext.nxv1i1.nxv1i64( undef, undef, i32 undef) + %nxv2i8_nxv2i16 = zext undef to %nxv2i8_nxv2i32 = zext undef to %nxv2i8_nxv2i64 = zext undef to @@ -945,6 +1860,17 @@ define void @zext() { %nxv2i1_nxv2i32 = zext undef to %nxv2i1_nxv2i64 = zext undef to + %vp_nxv2i8_nxv2i16 = call @llvm.vp.zext.nxv2i8.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i8_nxv2i32 = call @llvm.vp.zext.nxv2i8.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i8_nxv2i64 = call @llvm.vp.zext.nxv2i8.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i16_nxv2i32 = call @llvm.vp.zext.nxv2i16.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i16_nxv2i64 = call @llvm.vp.zext.nxv2i16.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i32_nxv2i64 = call @llvm.vp.zext.nxv2i32.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i8 = call @llvm.vp.zext.nxv2i1.nxv2i8( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i16 = call @llvm.vp.zext.nxv2i1.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i32 = call @llvm.vp.zext.nxv2i1.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i64 = call @llvm.vp.zext.nxv2i1.nxv2i64( undef, undef, i32 undef) + %nxv4i8_nxv4i16 = zext undef to %nxv4i8_nxv4i32 = zext undef to %nxv4i8_nxv4i64 = zext undef to @@ -956,6 +1882,17 @@ define void @zext() { %nxv4i1_nxv4i32 = zext undef to %nxv4i1_nxv4i64 = zext undef to + %vp_nxv4i8_nxv4i16 = call @llvm.vp.zext.nxv4i8.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i32 = call @llvm.vp.zext.nxv4i8.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i64 = call @llvm.vp.zext.nxv4i8.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i32 = call @llvm.vp.zext.nxv4i16.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i64 = call @llvm.vp.zext.nxv4i16.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i64 = call @llvm.vp.zext.nxv4i32.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i8 = call @llvm.vp.zext.nxv4i1.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i16 = call @llvm.vp.zext.nxv4i1.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i32 = call @llvm.vp.zext.nxv4i1.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i64 = call @llvm.vp.zext.nxv4i1.nxv4i64( undef, undef, i32 undef) + %nxv8i8_nxv8i16 = zext undef to %nxv8i8_nxv8i32 = zext undef to %nxv8i8_nxv8i64 = zext undef to @@ -967,6 +1904,17 @@ define void @zext() { %nxv8i1_nxv8i32 = zext undef to %nxv8i1_nxv8i64 = zext undef to + %vp_nxv8i8_nxv8i16 = call @llvm.vp.zext.nxv8i8.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i32 = call @llvm.vp.zext.nxv8i8.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i64 = call @llvm.vp.zext.nxv8i8.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i32 = call @llvm.vp.zext.nxv8i16.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i64 = call @llvm.vp.zext.nxv8i16.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i64 = call @llvm.vp.zext.nxv8i32.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i8 = call @llvm.vp.zext.nxv8i1.nxv8i8( undef, undef, i32 
undef) + %vp_nxv8i1_nxv8i16 = call @llvm.vp.zext.nxv8i1.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i32 = call @llvm.vp.zext.nxv8i1.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i64 = call @llvm.vp.zext.nxv8i1.nxv8i64( undef, undef, i32 undef) + %nxv16i8_nxv16i16 = zext undef to %nxv16i8_nxv16i32 = zext undef to %nxv16i8_nxv16i64 = zext undef to @@ -978,6 +1926,17 @@ define void @zext() { %nxv16i1_nxv16i32 = zext undef to %nxv16i1_nxv16i64 = zext undef to + %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i8.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i8.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i8.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i32 = call @llvm.vp.zext.nxv16i16.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i64 = call @llvm.vp.zext.nxv16i16.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i64 = call @llvm.vp.zext.nxv16i32.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i1.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i1.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i1.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i1.nxv16i64( undef, undef, i32 undef) + %nxv32i8_nxv32i16 = zext undef to %nxv32i8_nxv32i32 = zext undef to %nxv32i8_nxv32i64 = zext undef to @@ -989,6 +1948,17 @@ define void @zext() { %nxv32i1_nxv32i32 = zext undef to %nxv32i1_nxv32i64 = zext undef to + %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i8.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i8.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i8_nxv32i64 = call @llvm.vp.zext.nxv32i8.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i32 = call @llvm.vp.zext.nxv32i16.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i64 = call @llvm.vp.zext.nxv32i16.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i32.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i1.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i1.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i1.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i1.nxv32i64( undef, undef, i32 undef) + %nxv64i8_nxv64i16 = zext undef to %nxv64i8_nxv64i32 = zext undef to %nxv64i8_nxv64i64 = zext undef to @@ -1000,6 +1970,17 @@ define void @zext() { %nxv64i1_nxv64i32 = zext undef to %nxv64i1_nxv64i64 = zext undef to + %vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i8.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i8.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i8.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i16.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i16.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i32.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i1.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i1.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i1.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i1.nxv64i64( undef, undef, i32 undef) + %nxv128i8_nxv128i16 = 
zext undef to %nxv128i8_nxv128i32 = zext undef to %nxv128i8_nxv128i128 = zext undef to @@ -1011,6 +1992,17 @@ define void @zext() { %nxv128i1_nxv128i32 = zext undef to %nxv128i1_nxv128i128 = zext undef to + %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i8.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i8.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i8.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i16.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i16.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i32.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i1.nxv128i8( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i1.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i1.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i1.nxv128i128( undef, undef, i32 undef) + ret void } @@ -1021,6 +2013,11 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6> ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> @@ -1031,6 +2028,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost 
of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8> @@ -1041,6 +2048,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8> @@ -1051,6 +2068,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8> @@ -1061,6 +2088,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> 
+; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8> @@ -1071,6 +2108,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 
for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8> @@ -1081,6 +2128,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> 
undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> @@ -1091,6 +2148,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> @@ -1101,6 +2168,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x 
i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc undef to @@ -1111,6 +2188,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call @llvm.vp.trunc.nxv1i32.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vp_nxv1i16_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc undef to @@ -1121,6 +2208,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call @llvm.vp.trunc.nxv2i32.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc undef to @@ -1131,6 +2228,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call 
@llvm.vp.trunc.nxv4i8.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i32.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc undef to @@ -1141,6 +2248,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i32.nxv8i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc undef to @@ -1151,6 +2268,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i32.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc undef to @@ -1161,6 +2288,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i32( undef, undef, i32 undef) +; 
RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i32.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i64_nxv64i8 = trunc undef to @@ -1171,6 +2308,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i64_nxv64i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i32.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'trunc' @@ -1179,6 +2326,11 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6> ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> ; 
RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> @@ -1189,6 +2341,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = 
trunc <4 x i16> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8> @@ -1199,6 +2361,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8> @@ -1209,6 +2381,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8> @@ -1219,6 +2401,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> 
undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8> @@ -1229,6 +2421,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8> @@ -1239,6 +2441,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> @@ -1249,6 +2461,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> 
undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> @@ -1259,6 +2481,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc undef to @@ -1269,6 +2501,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call @llvm.vp.trunc.nxv1i32.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc undef to @@ -1279,6 +2521,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call @llvm.vp.trunc.nxv2i32.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc undef to @@ -1289,6 +2541,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i32.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc undef to @@ -1299,6 +2561,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i32.nxv8i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc undef to @@ -1309,6 +2581,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i64( undef, undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i32.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc undef to @@ -1319,6 +2601,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i32.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64i64_nxv64i8 = trunc undef to @@ -1329,6 +2621,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc undef to ; RV64-NEXT: 
Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i64_nxv64i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i32.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -1338,6 +2640,12 @@ define void @trunc() { %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> + %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i16.v2i2(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i16.v2i4(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i16.v2i6(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i4.v2i2(<2 x i4> undef, <2 x i1> undef, i32 undef) + %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i6.v2i4(<2 x i6> undef, <2 x i1> undef, i32 undef) + %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> @@ -1349,6 +2657,17 @@ define void @trunc() { %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> + %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i16.v2i8(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i32.v2i8(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i64.v2i8(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i32.v2i16(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i64.v2i16(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i64.v2i32(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i8.v2i1(<2 x i8> undef, <2 x i1> undef, i32 undef) + 
%vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i16.v2i1(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i32.v2i1(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i64.v2i1(<2 x i64> undef, <2 x i1> undef, i32 undef) + %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8> %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8> @@ -1360,6 +2679,17 @@ define void @trunc() { %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> + %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i16.v4i8(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i32.v4i8(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i64.v4i8(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i32.v4i16(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i64.v4i16(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i64.v4i32(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i8.v4i1(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i16.v4i1(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i32.v4i1(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i64.v4i1(<4 x i64> undef, <4 x i1> undef, i32 undef) + %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8> @@ -1371,6 +2701,17 @@ define void @trunc() { %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> + %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i16.v8i8(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i32.v8i8(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i64.v8i8(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i32.v8i16(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i64.v8i16(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i64.v8i32(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i8.v8i1(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i16.v8i1(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i32.v8i1(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i64.v8i1(<8 x i64> undef, <8 x i1> undef, i32 undef) + %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8> @@ -1382,6 +2723,17 @@ define void @trunc() { %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> + %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i16.v16i8(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i32.v16i8(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i64.v16i8(<16 x i64> undef, 
<16 x i1> undef, i32 undef) + %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i32.v16i16(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i64.v16i16(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i64.v16i32(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i8.v16i1(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i16.v16i1(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i32.v16i1(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i64.v16i1(<16 x i64> undef, <16 x i1> undef, i32 undef) + %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8> @@ -1393,6 +2745,17 @@ define void @trunc() { %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> + %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i16.v32i8(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i32.v32i8(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i64.v32i8(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i32.v32i16(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i64.v32i16(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i64.v32i32(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i8.v32i1(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i16.v32i1(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i32.v32i1(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i64.v32i1(<32 x i64> undef, <32 x i1> undef, i32 undef) + %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8> @@ -1404,6 +2767,17 @@ define void @trunc() { %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> + %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i16.v64i8(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i32.v64i8(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i64.v64i8(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i32.v64i16(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i64.v64i16(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i64.v64i32(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i8.v64i1(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i16.v64i1(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i32.v64i1(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i1 = call <64 x 
i1> @llvm.vp.trunc.v64i64.v64i1(<64 x i64> undef, <64 x i1> undef, i32 undef) + %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> @@ -1415,6 +2789,17 @@ define void @trunc() { %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> + %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i16.v128i8(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i32.v128i8(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i64.v128i8(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i32.v128i16(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i64.v128i16(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i64.v128i32(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i8.v128i1(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i16.v128i1(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i32.v128i1(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i64.v128i1(<128 x i64> undef, <128 x i1> undef, i32 undef) + %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> @@ -1426,6 +2811,17 @@ define void @trunc() { %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> + %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i16.v256i8(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i32.v256i8(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i64.v256i8(<256 x i64> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i32.v256i16(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i64.v256i16(<256 x i64> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i64.v256i32(<256 x i64> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i8.v256i1(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i16.v256i1(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i32.v256i1(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i64.v256i1(<256 x i64> undef, <256 x i1> undef, i32 undef) + %nxv1i16_nxv1i8 = trunc undef to %nxv1i32_nxv1i8 = trunc undef to %nxv1i64_nxv1i8 = trunc undef to @@ -1437,6 +2833,17 @@ define void @trunc() { %nxv1i32_nxv1i1 = trunc undef to %nxv1i64_nxv1i1 = trunc undef to + %vp_nxv1i16_nxv1i8 = call @llvm.vp.trunc.nxv1i16.nxv1i8( undef, undef, i32 undef) + %vp_nxv1i32_nxv1i8 = call @llvm.vp.trunc.nxv1i32.nxv1i8( undef, undef, i32 undef) + %vp_nxv1i64_nxv1i8 = call @llvm.vp.trunc.nxv1i64.nxv1i8( undef, undef, i32 undef) + 
%vp_nxv1i32_nxv1i16 = call @llvm.vp.trunc.nxv1i32.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i64_nxv1i16 = call @llvm.vp.trunc.nxv1i64.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i64_nxv1i32 = call @llvm.vp.trunc.nxv1i64.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i8_nxv1i1 = call @llvm.vp.trunc.nxv1i8.nxv1i1( undef, undef, i32 undef) + %vp_nxv1i16_nxv1i1 = call @llvm.vp.trunc.nxv1i16.nxv1i1( undef, undef, i32 undef) + %vp_nxv1i32_nxv1i1 = call @llvm.vp.trunc.nxv1i32.nxv1i1( undef, undef, i32 undef) + %vp_nxv1i64_nxv1i1 = call @llvm.vp.trunc.nxv1i64.nxv1i1( undef, undef, i32 undef) + %nxv2i16_nxv2i8 = trunc undef to %nxv2i32_nxv2i8 = trunc undef to %nxv2i64_nxv2i8 = trunc undef to @@ -1448,6 +2855,17 @@ define void @trunc() { %nxv2i32_nxv2i1 = trunc undef to %nxv2i64_nxv2i1 = trunc undef to + %vp_nxv2i16_nxv2i8 = call @llvm.vp.trunc.nxv2i16.nxv2i8( undef, undef, i32 undef) + %vp_nxv2i32_nxv2i8 = call @llvm.vp.trunc.nxv2i32.nxv2i8( undef, undef, i32 undef) + %vp_nxv2i64_nxv2i8 = call @llvm.vp.trunc.nxv2i64.nxv2i8( undef, undef, i32 undef) + %vp_nxv2i32_nxv2i16 = call @llvm.vp.trunc.nxv2i32.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i64_nxv2i16 = call @llvm.vp.trunc.nxv2i64.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i64_nxv2i32 = call @llvm.vp.trunc.nxv2i64.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i8_nxv2i1 = call @llvm.vp.trunc.nxv2i8.nxv2i1( undef, undef, i32 undef) + %vp_nxv2i16_nxv2i1 = call @llvm.vp.trunc.nxv2i16.nxv2i1( undef, undef, i32 undef) + %vp_nxv2i32_nxv2i1 = call @llvm.vp.trunc.nxv2i32.nxv2i1( undef, undef, i32 undef) + %vp_nxv2i64_nxv2i1 = call @llvm.vp.trunc.nxv2i64.nxv2i1( undef, undef, i32 undef) + %nxv4i16_nxv4i8 = trunc undef to %nxv4i32_nxv4i8 = trunc undef to %nxv4i64_nxv4i8 = trunc undef to @@ -1459,6 +2877,17 @@ define void @trunc() { %nxv4i32_nxv4i1 = trunc undef to %nxv4i64_nxv4i1 = trunc undef to + %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i16.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i8 = call @llvm.vp.trunc.nxv4i32.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i64.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i32.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i16 = call @llvm.vp.trunc.nxv4i64.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i64.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i8.nxv4i1( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i16.nxv4i1( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i32.nxv4i1( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i64.nxv4i1( undef, undef, i32 undef) + %nxv8i16_nxv8i8 = trunc undef to %nxv8i32_nxv8i8 = trunc undef to %nxv8i64_nxv8i8 = trunc undef to @@ -1470,6 +2899,17 @@ define void @trunc() { %nxv8i32_nxv8i1 = trunc undef to %nxv8i64_nxv8i1 = trunc undef to + %vp_nxv8i16_nxv8i8 = call @llvm.vp.trunc.nxv8i16.nxv8i8( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i32.nxv8i8( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i64.nxv8i8( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i32.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i64.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i64.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i8.nxv8i1( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i1 = call 
@llvm.vp.trunc.nxv8i16.nxv8i1( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i32.nxv8i1( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i64.nxv8i1( undef, undef, i32 undef) + %nxv16i16_nxv16i8 = trunc undef to %nxv16i32_nxv16i8 = trunc undef to %nxv16i64_nxv16i8 = trunc undef to @@ -1481,6 +2921,17 @@ define void @trunc() { %nxv16i32_nxv16i1 = trunc undef to %nxv16i64_nxv16i1 = trunc undef to + %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i16.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i32.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i64.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i32.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i64.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i64.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i8.nxv16i1( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i16.nxv16i1( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i32.nxv16i1( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i64.nxv16i1( undef, undef, i32 undef) + %nxv32i16_nxv32i8 = trunc undef to %nxv32i32_nxv32i8 = trunc undef to %nxv32i64_nxv32i8 = trunc undef to @@ -1492,6 +2943,17 @@ define void @trunc() { %nxv32i32_nxv32i1 = trunc undef to %nxv32i64_nxv32i1 = trunc undef to + %vp_nxv32i16_nxv32i8 = call @llvm.vp.trunc.nxv32i16.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i32.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i64.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i32.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i64.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i64.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i8.nxv32i1( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i1 = call @llvm.vp.trunc.nxv32i16.nxv32i1( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i32.nxv32i1( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i64.nxv32i1( undef, undef, i32 undef) + %nxv64i16_nxv64i8 = trunc undef to %nxv64i32_nxv64i8 = trunc undef to %nxv64i64_nxv64i8 = trunc undef to @@ -1503,6 +2965,17 @@ define void @trunc() { %nxv64i32_nxv64i1 = trunc undef to %nxv64i64_nxv64i1 = trunc undef to + %vp_nxv64i16_nxv64i8 = call @llvm.vp.trunc.nxv64i16.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i32.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i64.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i16 = call @llvm.vp.trunc.nxv64i32.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i64.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i64.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i8.nxv64i1( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i16.nxv64i1( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i1 = call @llvm.vp.trunc.nxv64i32.nxv64i1( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i64.nxv64i1( undef, undef, i32 undef) + ret 
void } @@ -1511,103 +2984,201 @@ define void @fpext() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2f32 = fpext <2 x half> undef to <2 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2f64 = fpext <2 x half> undef to <2 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2f64 = fpext <2 x float> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2f32.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4f32 = fpext <4 x half> undef to <4 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4f64 = fpext <4 x half> undef to <4 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4f64 = fpext <4 x float> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4f32.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8f32 = fpext <8 x half> undef to <8 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16_v8f64 = fpext <8 x half> undef to <8 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8f64 = fpext <8 x float> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8f32.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16_v16f32 = fpext <16 x half> undef to <16 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16_v16f64 = fpext <16 x half> undef to <16 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16f64 = fpext <16 x float> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16f32.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f16_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f16(<16 x half> 
undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32f16_v32f32 = fpext <32 x half> undef to <32 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f16_v32f64 = fpext <32 x half> undef to <32 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32f64 = fpext <32 x float> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32f32.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64f16_v64f32 = fpext <64 x half> undef to <64 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f16_v64f64 = fpext <64 x half> undef to <64 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64f64 = fpext <64 x float> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64f32.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128f16_v128f32 = fpext <128 x half> undef to <128 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128f16_v128f64 = fpext <128 x half> undef to <128 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128f64 = fpext <128 x float> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128f32.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f16_nxv1f32 = call 
@llvm.vp.fpext.nxv1f32.nxv1f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f16_nxv1f64 = call @llvm.vp.fpext.nxv1f64.nxv1f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1f64 = call @llvm.vp.fpext.nxv1f64.nxv1f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f16_nxv2f32 = call @llvm.vp.fpext.nxv2f32.nxv2f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f16_nxv2f64 = call @llvm.vp.fpext.nxv2f64.nxv2f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2f64 = call @llvm.vp.fpext.nxv2f64.nxv2f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f16_nxv4f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f16_nxv4f32 = call @llvm.vp.fpext.nxv4f32.nxv4f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f16_nxv4f64 = call @llvm.vp.fpext.nxv4f64.nxv4f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4f64 = call @llvm.vp.fpext.nxv4f64.nxv4f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f16_nxv8f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f16_nxv8f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f16_nxv8f32 = call @llvm.vp.fpext.nxv8f32.nxv8f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f16_nxv8f64 = call @llvm.vp.fpext.nxv8f64.nxv8f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8f64 = call @llvm.vp.fpext.nxv8f64.nxv8f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16f16_nxv16f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f16_nxv16f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f16_nxv16f32 = call @llvm.vp.fpext.nxv16f32.nxv16f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f16_nxv16f64 = call @llvm.vp.fpext.nxv16f64.nxv16f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16f64 = call @llvm.vp.fpext.nxv16f64.nxv16f32( undef, undef, i32 undef) ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32f16_nxv32f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f16_nxv32f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32f16_nxv32f32 = call @llvm.vp.fpext.nxv32f32.nxv32f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f16_nxv32f64 = call @llvm.vp.fpext.nxv32f64.nxv32f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32f64 = call @llvm.vp.fpext.nxv32f64.nxv32f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64f16_nxv64f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64f16_nxv64f32 = call @llvm.vp.fpext.nxv64f32.nxv64f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64f16_nxv64f64 = call @llvm.vp.fpext.nxv64f64.nxv64f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64f64 = call @llvm.vp.fpext.nxv64f64.nxv64f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f16_v2f32 = fpext <2 x half> undef to <2 x float> %v2f16_v2f64 = fpext <2 x half> undef to <2 x double> %v2f32_v2f64 = fpext <2 x float> undef to <2 x double> + %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2half.v2float(<2 x half> undef, <2 x i1> undef, i32 undef) + %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2half.v2double(<2 x half> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2float.v2double(<2 x float> undef, <2 x i1> undef, i32 undef) + %v4f16_v4f32 = fpext <4 x half> undef to <4 x float> %v4f16_v4f64 = fpext <4 x half> undef to <4 x double> %v4f32_v4f64 = fpext <4 x float> undef to <4 x double> + %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4half.v4float(<4 x half> undef, <4 x i1> undef, i32 undef) + %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4half.v4double(<4 x half> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4float.v4double(<4 x float> undef, <4 x i1> undef, i32 undef) + %v8f16_v8f32 = fpext <8 x half> undef to <8 x float> %v8f16_v8f64 = fpext <8 x half> undef to <8 x double> %v8f32_v8f64 = fpext <8 x float> undef to <8 x double> + %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8half.v8float(<8 x half> undef, <8 x i1> undef, i32 undef) + %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8half.v8double(<8 x half> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8float.v8double(<8 x float> undef, <8 x i1> undef, i32 undef) + %v16f16_v16f32 = fpext <16 x half> undef to <16 x float> %v16f16_v16f64 = fpext <16 x half> undef to <16 x double> %v16f32_v16f64 = fpext <16 x float> undef to <16 x double> + %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16half.v16float(<16 x half> undef, <16 x i1> undef, i32 undef) + %vp_v16f16_v16f64 = call <16 x 
double> @llvm.vp.fpext.v16half.v16double(<16 x half> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16float.v16double(<16 x float> undef, <16 x i1> undef, i32 undef) + %v32f16_v32f32 = fpext <32 x half> undef to <32 x float> %v32f16_v32f64 = fpext <32 x half> undef to <32 x double> %v32f32_v32f64 = fpext <32 x float> undef to <32 x double> + %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32half.v32float(<32 x half> undef, <32 x i1> undef, i32 undef) + %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32half.v32double(<32 x half> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32float.v32double(<32 x float> undef, <32 x i1> undef, i32 undef) + %v64f16_v64f32 = fpext <64 x half> undef to <64 x float> %v64f16_v64f64 = fpext <64 x half> undef to <64 x double> %v64f32_v64f64 = fpext <64 x float> undef to <64 x double> + %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64half.v64float(<64 x half> undef, <64 x i1> undef, i32 undef) + %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64half.v64double(<64 x half> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64float.v64double(<64 x float> undef, <64 x i1> undef, i32 undef) + %v128f16_v128f32 = fpext <128 x half> undef to <128 x float> %v128f16_v128f64 = fpext <128 x half> undef to <128 x double> %v128f32_v128f64 = fpext <128 x float> undef to <128 x double> + %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128half.v128float(<128 x half> undef, <128 x i1> undef, i32 undef) + %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128half.v128double(<128 x half> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128float.v128double(<128 x float> undef, <128 x i1> undef, i32 undef) + %nxv1f16_nxv1f32 = fpext undef to %nxv1f16_nxv1f64 = fpext undef to %nxv1f32_nxv1f64 = fpext undef to + %vp_nxv1f16_nxv1f32 = call @llvm.vp.fpext.nxv1half.nxv1float( undef, undef, i32 undef) + %vp_nxv1f16_nxv1f64 = call @llvm.vp.fpext.nxv1half.nxv1double( undef, undef, i32 undef) + %vp_nxv1f32_nxv1f64 = call @llvm.vp.fpext.nxv1float.nxv1double( undef, undef, i32 undef) + %nxv2f16_nxv2f32 = fpext undef to %nxv2f16_nxv2f64 = fpext undef to %nxv2f32_nxv2f64 = fpext undef to + %vp_nxv2f16_nxv2f32 = call @llvm.vp.fpext.nxv2half.nxv2float( undef, undef, i32 undef) + %vp_nxv2f16_nxv2f64 = call @llvm.vp.fpext.nxv2half.nxv2double( undef, undef, i32 undef) + %vp_nxv2f32_nxv2f64 = call @llvm.vp.fpext.nxv2float.nxv2double( undef, undef, i32 undef) + %nxv4f16_nxv4f32 = fpext undef to %nxv4f16_nxv4f64 = fpext undef to %nxv4f32_nxv4f64 = fpext undef to + %vp_nxv4f16_nxv4f32 = call @llvm.vp.fpext.nxv4half.nxv4float( undef, undef, i32 undef) + %vp_nxv4f16_nxv4f64 = call @llvm.vp.fpext.nxv4half.nxv4double( undef, undef, i32 undef) + %vp_nxv4f32_nxv4f64 = call @llvm.vp.fpext.nxv4float.nxv4double( undef, undef, i32 undef) + %nxv8f16_nxv8f32 = fpext undef to %nxv8f16_nxv8f64 = fpext undef to %nxv8f32_nxv8f64 = fpext undef to + %vp_nxv8f16_nxv8f32 = call @llvm.vp.fpext.nxv8half.nxv8float( undef, undef, i32 undef) + %vp_nxv8f16_nxv8f64 = call @llvm.vp.fpext.nxv8half.nxv8double( undef, undef, i32 undef) + %vp_nxv8f32_nxv8f64 = call @llvm.vp.fpext.nxv8float.nxv8double( undef, undef, i32 undef) + %nxv16f16_nxv16f32 = fpext undef to %nxv16f16_nxv16f64 = fpext undef to %nxv16f32_nxv16f64 = fpext undef to + %vp_nxv16f16_nxv16f32 = call @llvm.vp.fpext.nxv16half.nxv16float( undef, undef, i32 
undef) + %vp_nxv16f16_nxv16f64 = call @llvm.vp.fpext.nxv16half.nxv16double( undef, undef, i32 undef) + %vp_nxv16f32_nxv16f64 = call @llvm.vp.fpext.nxv16float.nxv16double( undef, undef, i32 undef) + %nxv32f16_nxv32f32 = fpext undef to %nxv32f16_nxv32f64 = fpext undef to %nxv32f32_nxv32f64 = fpext undef to + %vp_nxv32f16_nxv32f32 = call @llvm.vp.fpext.nxv32half.nxv32float( undef, undef, i32 undef) + %vp_nxv32f16_nxv32f64 = call @llvm.vp.fpext.nxv32half.nxv32double( undef, undef, i32 undef) + %vp_nxv32f32_nxv32f64 = call @llvm.vp.fpext.nxv32float.nxv32double( undef, undef, i32 undef) + %nxv64f16_nxv64f32 = fpext undef to %nxv64f16_nxv64f64 = fpext undef to %nxv64f32_nxv64f64 = fpext undef to + %vp_nxv64f16_nxv64f32 = call @llvm.vp.fpext.nxv64half.nxv64float( undef, undef, i32 undef) + %vp_nxv64f16_nxv64f64 = call @llvm.vp.fpext.nxv64half.nxv64double( undef, undef, i32 undef) + %vp_nxv64f32_nxv64f64 = call @llvm.vp.fpext.nxv64float.nxv64double( undef, undef, i32 undef) + ret void } @@ -1616,103 +3187,201 @@ define void @fptrunc() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2f16 = fptrunc <2 x float> undef to <2 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2f16 = fptrunc <2 x double> undef to <2 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2f32 = fptrunc <2 x double> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2f32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4f16 = fptrunc <4 x float> undef to <4 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4f16 = fptrunc <4 x double> undef to <4 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4f32 = fptrunc <4 x double> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4f32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8f16 = fptrunc <8 x float> undef to <8 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8f16 = fptrunc <8 x double> undef to <8 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8f32 = fptrunc <8 x double> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8f32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16f16 = fptrunc <16 x float> undef to <16 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16f16 = fptrunc <16 x double> undef to <16 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16f32 = fptrunc <16 x double> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16f32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32f16 = fptrunc <32 x float> undef to <32 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32f16 = fptrunc <32 x double> undef to <32 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32f32 = fptrunc <32 x double> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32f32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64f16 = fptrunc <64 x float> undef to <64 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64f16 = fptrunc <64 x double> undef to <64 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64f32 = fptrunc <64 x double> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64f32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128f16 = fptrunc <128 x float> undef to <128 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128f16 = fptrunc <128 x double> undef to <128 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: 
%v128f64_v128f32 = fptrunc <128 x double> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128f32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1f16 = call @llvm.vp.fptrunc.nxv1f16.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1f16 = call @llvm.vp.fptrunc.nxv1f16.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1f32 = call @llvm.vp.fptrunc.nxv1f32.nxv1f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv1f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv1f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv1f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2f16 = call @llvm.vp.fptrunc.nxv2f16.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2f16 = call @llvm.vp.fptrunc.nxv2f16.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2f32 = call @llvm.vp.fptrunc.nxv2f32.nxv2f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4f16 = call @llvm.vp.fptrunc.nxv4f16.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4f16 = call @llvm.vp.fptrunc.nxv4f16.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4f32 = call @llvm.vp.fptrunc.nxv4f32.nxv4f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8f16 = call @llvm.vp.fptrunc.nxv8f16.nxv8f32( undef, undef, i32 undef) +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8f16 = call @llvm.vp.fptrunc.nxv8f16.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8f32 = call @llvm.vp.fptrunc.nxv8f32.nxv8f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16f16 = call @llvm.vp.fptrunc.nxv16f16.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16f16 = call @llvm.vp.fptrunc.nxv16f16.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16f32 = call @llvm.vp.fptrunc.nxv16f32.nxv16f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32f16 = call @llvm.vp.fptrunc.nxv32f16.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32f16 = call @llvm.vp.fptrunc.nxv32f16.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32f32 = call @llvm.vp.fptrunc.nxv32f32.nxv32f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64f16 = call @llvm.vp.fptrunc.nxv64f16.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64f16 = call @llvm.vp.fptrunc.nxv64f16.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64f32 = call @llvm.vp.fptrunc.nxv64f32.nxv64f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f32_v2f16 = fptrunc <2 x float> undef to <2 x half> %v2f64_v2f16 = fptrunc <2 x double> undef to <2 x half> %v2f64_v2f32 = fptrunc <2 x double> undef to <2 x float> + %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2float.v2half(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2double.v2half(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2double.v2float(<2 x double> undef, <2 x i1> undef, i32 undef) + %v4f32_v4f16 = fptrunc <4 x float> undef to <4 x half> %v4f64_v4f16 = fptrunc <4 x double> undef to <4 x half> %v4f64_v4f32 = fptrunc <4 x double> undef to <4 x float> + 
%vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4float.v4half(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4double.v4half(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4double.v4float(<4 x double> undef, <4 x i1> undef, i32 undef) + %v8f32_v8f16 = fptrunc <8 x float> undef to <8 x half> %v8f64_v8f16 = fptrunc <8 x double> undef to <8 x half> %v8f64_v8f32 = fptrunc <8 x double> undef to <8 x float> + %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8float.v8half(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8double.v8half(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8double.v8float(<8 x double> undef, <8 x i1> undef, i32 undef) + %v16f32_v16f16 = fptrunc <16 x float> undef to <16 x half> %v16f64_v16f16 = fptrunc <16 x double> undef to <16 x half> %v16f64_v16f32 = fptrunc <16 x double> undef to <16 x float> + %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16float.v16half(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16double.v16half(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16double.v16float(<16 x double> undef, <16 x i1> undef, i32 undef) + %v32f32_v32f16 = fptrunc <32 x float> undef to <32 x half> %v32f64_v32f16 = fptrunc <32 x double> undef to <32 x half> %v32f64_v32f32 = fptrunc <32 x double> undef to <32 x float> + %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32float.v32half(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32double.v32half(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32double.v32float(<32 x double> undef, <32 x i1> undef, i32 undef) + %v64f32_v64f16 = fptrunc <64 x float> undef to <64 x half> %v64f64_v64f16 = fptrunc <64 x double> undef to <64 x half> %v64f64_v64f32 = fptrunc <64 x double> undef to <64 x float> + %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64float.v64half(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64double.v64half(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64double.v64float(<64 x double> undef, <64 x i1> undef, i32 undef) + %v128f32_v128f16 = fptrunc <128 x float> undef to <128 x half> %v128f64_v128f16 = fptrunc <128 x double> undef to <128 x half> %v128f64_v128f32 = fptrunc <128 x double> undef to <128 x float> + %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128float.v128half(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128double.v128half(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128double.v128float(<128 x double> undef, <128 x i1> undef, i32 undef) + %nxv1f32_nxv1f16 = fptrunc undef to %nxv1f64_nxv1f16 = fptrunc undef to %nxv1f64_nxv1f32 = fptrunc undef to + %vp_nxv1f32_nxv1f16 = call @llvm.vp.fptrunc.nxv1float.nxv1half( undef, undef, i32 undef) + %vp_nxv1f64_nxv1f16 = call @llvm.vp.fptrunc.nxv1double.nxv1half( undef, undef, i32 undef) + %vp_nxv1f64_nxv1f32 = call @llvm.vp.fptrunc.nxv1double.nxv1float( undef, undef, i32 undef) + %nxv2f32_nxv1f16 = fptrunc undef to %nxv2f64_nxv1f16 = fptrunc 
undef to %nxv2f64_nxv1f32 = fptrunc undef to + %vp_nxv2f32_nxv2f16 = call @llvm.vp.fptrunc.nxv2float.nxv2half( undef, undef, i32 undef) + %vp_nxv2f64_nxv2f16 = call @llvm.vp.fptrunc.nxv2double.nxv2half( undef, undef, i32 undef) + %vp_nxv2f64_nxv2f32 = call @llvm.vp.fptrunc.nxv2double.nxv2float( undef, undef, i32 undef) + %nxv4f32_nxv4f16 = fptrunc undef to %nxv4f64_nxv4f16 = fptrunc undef to %nxv4f64_nxv4f32 = fptrunc undef to + %vp_nxv4f32_nxv4f16 = call @llvm.vp.fptrunc.nxv4float.nxv4half( undef, undef, i32 undef) + %vp_nxv4f64_nxv4f16 = call @llvm.vp.fptrunc.nxv4double.nxv4half( undef, undef, i32 undef) + %vp_nxv4f64_nxv4f32 = call @llvm.vp.fptrunc.nxv4double.nxv4float( undef, undef, i32 undef) + %nxv8f32_nxv8f16 = fptrunc undef to %nxv8f64_nxv8f16 = fptrunc undef to %nxv8f64_nxv8f32 = fptrunc undef to + %vp_nxv8f32_nxv8f16 = call @llvm.vp.fptrunc.nxv8float.nxv8half( undef, undef, i32 undef) + %vp_nxv8f64_nxv8f16 = call @llvm.vp.fptrunc.nxv8double.nxv8half( undef, undef, i32 undef) + %vp_nxv8f64_nxv8f32 = call @llvm.vp.fptrunc.nxv8double.nxv8float( undef, undef, i32 undef) + %nxv16f32_nxv16f16 = fptrunc undef to %nxv16f64_nxv16f16 = fptrunc undef to %nxv16f64_nxv16f32 = fptrunc undef to + %vp_nxv16f32_nxv16f16 = call @llvm.vp.fptrunc.nxv16float.nxv16half( undef, undef, i32 undef) + %vp_nxv16f64_nxv16f16 = call @llvm.vp.fptrunc.nxv16double.nxv16half( undef, undef, i32 undef) + %vp_nxv16f64_nxv16f32 = call @llvm.vp.fptrunc.nxv16double.nxv16float( undef, undef, i32 undef) + %nxv32f32_nxv32f16 = fptrunc undef to %nxv32f64_nxv32f16 = fptrunc undef to %nxv32f64_nxv32f32 = fptrunc undef to + %vp_nxv32f32_nxv32f16 = call @llvm.vp.fptrunc.nxv32float.nxv32half( undef, undef, i32 undef) + %vp_nxv32f64_nxv32f16 = call @llvm.vp.fptrunc.nxv32double.nxv32half( undef, undef, i32 undef) + %vp_nxv32f64_nxv32f32 = call @llvm.vp.fptrunc.nxv32double.nxv32float( undef, undef, i32 undef) + %nxv64f32_nxv64f16 = fptrunc undef to %nxv64f64_nxv64f16 = fptrunc undef to %nxv64f64_nxv64f32 = fptrunc undef to + %vp_nxv64f32_nxv64f16 = call @llvm.vp.fptrunc.nxv64float.nxv64half( undef, undef, i32 undef) + %vp_nxv64f64_nxv64f16 = call @llvm.vp.fptrunc.nxv64double.nxv64half( undef, undef, i32 undef) + %vp_nxv64f64_nxv64f32 = call @llvm.vp.fptrunc.nxv64double.nxv64float( undef, undef, i32 undef) + ret void } @@ -1728,6 +3397,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> 
@llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> @@ -1738,6 +3417,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> @@ -1748,6 +3437,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> @@ -1758,6 +3457,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; 
RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> @@ -1768,6 +3477,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x 
double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> @@ -1778,6 +3497,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = 
fptosi <128 x float> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> @@ -1788,6 +3517,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi undef to @@ -1798,6 +3537,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi undef to @@ -1808,6 +3557,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = 
call @llvm.vp.fptosi.nxv2i1.nxv2f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi undef to @@ -1818,6 +3577,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi undef to @@ -1828,6 +3597,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call 
@llvm.vp.fptosi.nxv8i16.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi undef to @@ -1838,6 +3617,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an 
estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi undef to @@ -1848,6 +3637,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi undef to @@ -1858,6 +3657,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f32( undef, undef, i32 
undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'fptosi' @@ -1871,6 +3680,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> @@ -1881,6 +3700,16 @@ define void @fptosi() { ; RV64-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> @@ -1891,6 +3720,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x 
i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> @@ -1901,6 +3740,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> 
@llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> @@ -1911,6 +3760,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> @@ -1921,6 +3780,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for 
instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> @@ -1931,6 +3800,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: 
%vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi undef to @@ -1941,6 +3820,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi undef to @@ -1951,6 +3840,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi undef to @@ -1961,6 +3860,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi undef to @@ -1971,6 +3880,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi undef to @@ -1981,6 +3900,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi undef to @@ -1991,6 +3920,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f64( undef, undef, i32 
undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi undef to @@ -2001,6 +3940,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> @@ -2014,6 +3963,17 @@ define void @fptosi() { %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> + %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2float.v2i8(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2double.v2i8(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2float.v2i16(<2 x float> undef, <2 x i1> 
undef, i32 undef) + %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2double.v2i16(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2float.v2i32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2double.v2i32(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2float.v2i64(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2double.v2i64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2float.v2i1(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2double.v2i1(<2 x double> undef, <2 x i1> undef, i32 undef) + %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> @@ -2025,6 +3985,17 @@ define void @fptosi() { %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> + %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4float.v4i8(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4double.v4i8(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4float.v4i16(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4double.v4i16(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4float.v4i32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4double.v4i32(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4float.v4i64(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4double.v4i64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4float.v4i1(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4double.v4i1(<4 x double> undef, <4 x i1> undef, i32 undef) + %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> @@ -2036,6 +4007,17 @@ define void @fptosi() { %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> + %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8float.v8i8(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8double.v8i8(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8float.v8i16(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8double.v8i16(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8float.v8i32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8double.v8i32(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8float.v8i64(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8double.v8i64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8float.v8i1(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i1 = call <8 x i1> 
@llvm.vp.fptosi.v8double.v8i1(<8 x double> undef, <8 x i1> undef, i32 undef) + %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> @@ -2047,6 +4029,17 @@ define void @fptosi() { %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> + %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16float.v16i8(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16double.v16i8(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16float.v16i16(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16double.v16i16(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16float.v16i32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16double.v16i32(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16float.v16i64(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16double.v16i64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16float.v16i1(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16double.v16i1(<16 x double> undef, <16 x i1> undef, i32 undef) + %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> @@ -2058,6 +4051,17 @@ define void @fptosi() { %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> + %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32float.v32i8(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32double.v32i8(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32float.v32i16(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32double.v32i16(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32float.v32i32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32double.v32i32(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32float.v32i64(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32double.v32i64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32float.v32i1(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32double.v32i1(<32 x double> undef, <32 x i1> undef, i32 undef) + %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> @@ -2069,6 +4073,17 @@ define void @fptosi() { %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> + %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64float.v64i8(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i8 = 
call <64 x i8> @llvm.vp.fptosi.v64double.v64i8(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64float.v64i16(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64double.v64i16(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64float.v64i32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64double.v64i32(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64float.v64i64(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64double.v64i64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64float.v64i1(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64double.v64i1(<64 x double> undef, <64 x i1> undef, i32 undef) + %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> @@ -2080,6 +4095,17 @@ define void @fptosi() { %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> + %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128float.v128i8(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128double.v128i8(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128float.v128i16(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128double.v128i16(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128float.v128i32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128double.v128i32(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128float.v128i64(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128double.v128i64(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128float.v128i1(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128double.v128i1(<128 x double> undef, <128 x i1> undef, i32 undef) + %nxv1f32_nxv1i8 = fptosi undef to %nxv1f64_nxv1i8 = fptosi undef to %nxv1f32_nxv1i16 = fptosi undef to @@ -2091,6 +4117,17 @@ define void @fptosi() { %nxv1f32_nxv1i1 = fptosi undef to %nxv1f64_nxv1i1 = fptosi undef to + %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptosi.nxv1float.nxv1i8( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptosi.nxv1double.nxv1i8( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptosi.nxv1float.nxv1i16( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptosi.nxv1double.nxv1i16( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptosi.nxv1float.nxv1i32( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptosi.nxv1double.nxv1i32( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptosi.nxv1float.nxv1i64( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptosi.nxv1double.nxv1i64( undef, 
undef, i32 undef) + %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptosi.nxv1float.nxv1i1( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptosi.nxv1double.nxv1i1( undef, undef, i32 undef) + %nxv2f32_nxv2i8 = fptosi undef to %nxv2f64_nxv2i8 = fptosi undef to %nxv2f32_nxv2i16 = fptosi undef to @@ -2102,6 +4139,17 @@ define void @fptosi() { %nxv2f32_nxv2i1 = fptosi undef to %nxv2f64_nxv2i1 = fptosi undef to + %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptosi.nxv2float.nxv2i8( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptosi.nxv2double.nxv2i8( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptosi.nxv2float.nxv2i16( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptosi.nxv2double.nxv2i16( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptosi.nxv2float.nxv2i32( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptosi.nxv2double.nxv2i32( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptosi.nxv2float.nxv2i64( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptosi.nxv2double.nxv2i64( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptosi.nxv2float.nxv2i1( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptosi.nxv2double.nxv2i1( undef, undef, i32 undef) + %nxv4f32_nxv4i8 = fptosi undef to %nxv4f64_nxv4i8 = fptosi undef to %nxv4f32_nxv4i16 = fptosi undef to @@ -2113,6 +4161,17 @@ define void @fptosi() { %nxv4f32_nxv4i1 = fptosi undef to %nxv4f64_nxv4i1 = fptosi undef to + %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptosi.nxv4float.nxv4i8( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptosi.nxv4double.nxv4i8( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptosi.nxv4float.nxv4i16( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptosi.nxv4double.nxv4i16( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptosi.nxv4float.nxv4i32( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptosi.nxv4double.nxv4i32( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptosi.nxv4float.nxv4i64( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptosi.nxv4double.nxv4i64( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptosi.nxv4float.nxv4i1( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptosi.nxv4double.nxv4i1( undef, undef, i32 undef) + %nxv8f32_nxv8i8 = fptosi undef to %nxv8f64_nxv8i8 = fptosi undef to %nxv8f32_nxv8i16 = fptosi undef to @@ -2124,6 +4183,17 @@ define void @fptosi() { %nxv8f32_nxv8i1 = fptosi undef to %nxv8f64_nxv8i1 = fptosi undef to + %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptosi.nxv8float.nxv8i8( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptosi.nxv8double.nxv8i8( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptosi.nxv8float.nxv8i16( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptosi.nxv8double.nxv8i16( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptosi.nxv8float.nxv8i32( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptosi.nxv8double.nxv8i32( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptosi.nxv8float.nxv8i64( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptosi.nxv8double.nxv8i64( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptosi.nxv8float.nxv8i1( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptosi.nxv8double.nxv8i1( undef, undef, i32 undef) + %nxv16f32_nxv16i8 = fptosi undef to 
%nxv16f64_nxv16i8 = fptosi undef to %nxv16f32_nxv16i16 = fptosi undef to @@ -2135,6 +4205,17 @@ define void @fptosi() { %nxv16f32_nxv16i1 = fptosi undef to %nxv16f64_nxv16i1 = fptosi undef to + %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptosi.nxv16float.nxv16i8( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptosi.nxv16double.nxv16i8( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptosi.nxv16float.nxv16i16( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptosi.nxv16double.nxv16i16( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptosi.nxv16float.nxv16i32( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptosi.nxv16double.nxv16i32( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptosi.nxv16float.nxv16i64( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptosi.nxv16double.nxv16i64( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptosi.nxv16float.nxv16i1( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptosi.nxv16double.nxv16i1( undef, undef, i32 undef) + %nxv32f32_nxv32i8 = fptosi undef to %nxv32f64_nxv32i8 = fptosi undef to %nxv32f32_nxv32i16 = fptosi undef to @@ -2146,6 +4227,17 @@ define void @fptosi() { %nxv32f32_nxv32i1 = fptosi undef to %nxv32f64_nxv32i1 = fptosi undef to + %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptosi.nxv32float.nxv32i8( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptosi.nxv32double.nxv32i8( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptosi.nxv32float.nxv32i16( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptosi.nxv32double.nxv32i16( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptosi.nxv32float.nxv32i32( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptosi.nxv32double.nxv32i32( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptosi.nxv32float.nxv32i64( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptosi.nxv32double.nxv32i64( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptosi.nxv32float.nxv32i1( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptosi.nxv32double.nxv32i1( undef, undef, i32 undef) + %nxv64f32_nxv64i8 = fptosi undef to %nxv64f64_nxv64i8 = fptosi undef to %nxv64f32_nxv64i16 = fptosi undef to @@ -2157,6 +4249,17 @@ define void @fptosi() { %nxv64f32_nxv64i1 = fptosi undef to %nxv64f64_nxv64i1 = fptosi undef to + %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptosi.nxv64float.nxv64i8( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptosi.nxv64double.nxv64i8( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptosi.nxv64float.nxv64i16( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptosi.nxv64double.nxv64i16( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptosi.nxv64float.nxv64i32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptosi.nxv64double.nxv64i32( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptosi.nxv64float.nxv64i64( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptosi.nxv64double.nxv64i64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptosi.nxv64float.nxv64i1( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptosi.nxv64double.nxv64i1( undef, undef, i32 undef) + ret void } @@ -2172,6 +4275,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16> @@ -2182,6 +4295,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x 
double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> @@ -2192,6 +4315,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost 
of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> @@ -2202,6 +4335,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> @@ -2212,6 +4355,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: 
%v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16> @@ -2222,6 +4375,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> @@ -2232,6 +4395,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> 
@llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui undef to @@ -2242,6 +4415,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui undef to @@ -2252,6 +4435,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui undef to @@ -2262,6 +4455,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui undef to ; 
RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui undef to @@ -2272,6 +4475,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui undef to @@ -2282,6 +4495,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui undef to @@ -2292,6 +4515,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui undef to @@ -2302,6 +4535,16 @@ define 
void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'fptoui' @@ -2315,6 +4558,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16> @@ -2325,6 +4578,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> @@ -2335,6 +4598,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> @@ -2345,6 +4618,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> @@ -2355,6 +4638,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16> @@ -2365,6 +4658,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: 
%v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> @@ -2375,6 +4678,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui undef to @@ -2385,6 +4698,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call 
@llvm.vp.fptoui.nxv1i16.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui undef to @@ -2395,6 +4718,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %nxv4f64_nxv4i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui undef to @@ -2405,6 +4738,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui undef to @@ -2415,6 +4758,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f32( undef, undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui undef to @@ -2425,6 +4778,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui undef to @@ -2435,6 +4798,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui 
undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui undef to @@ -2445,6 +4818,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 
for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8> @@ -2458,6 +4841,17 @@ define void @fptoui() { %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1> %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1> + %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2float.v2i8(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2double.v2i8(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2float.v2i16(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2double.v2i16(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2float.v2i32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2double.v2i32(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2float.v2i64(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2double.v2i64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2float.v2i1(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2double.v2i1(<2 x double> undef, <2 x i1> undef, i32 undef) + %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8> %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16> @@ -2469,6 +4863,17 @@ define void @fptoui() { %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> + %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4float.v4i8(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4double.v4i8(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4float.v4i16(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4double.v4i16(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4float.v4i32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4double.v4i32(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4float.v4i64(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4double.v4i64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4float.v4i1(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4double.v4i1(<4 x double> undef, <4 x i1> undef, i32 undef) + %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> 
@@ -2480,6 +4885,17 @@ define void @fptoui() { %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> + %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8float.v8i8(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8double.v8i8(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8float.v8i16(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8double.v8i16(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8float.v8i32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8double.v8i32(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8float.v8i64(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8double.v8i64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8float.v8i1(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8double.v8i1(<8 x double> undef, <8 x i1> undef, i32 undef) + %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> @@ -2491,6 +4907,17 @@ define void @fptoui() { %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> + %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16float.v16i8(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16double.v16i8(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16float.v16i16(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16double.v16i16(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16float.v16i32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16double.v16i32(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16float.v16i64(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16double.v16i64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16float.v16i1(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16double.v16i1(<16 x double> undef, <16 x i1> undef, i32 undef) + %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> @@ -2502,6 +4929,17 @@ define void @fptoui() { %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> + %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32float.v32i8(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32double.v32i8(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32float.v32i16(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32double.v32i16(<32 x double> undef, <32 x i1> undef, i32 
undef) + %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32float.v32i32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32double.v32i32(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32float.v32i64(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32double.v32i64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32float.v32i1(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32double.v32i1(<32 x double> undef, <32 x i1> undef, i32 undef) + %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16> @@ -2513,6 +4951,17 @@ define void @fptoui() { %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> + %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64float.v64i8(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64double.v64i8(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64float.v64i16(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64double.v64i16(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64float.v64i32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64double.v64i32(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64float.v64i64(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64double.v64i64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64float.v64i1(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64double.v64i1(<64 x double> undef, <64 x i1> undef, i32 undef) + %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> @@ -2524,6 +4973,17 @@ define void @fptoui() { %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> + %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128float.v128i8(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128double.v128i8(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128float.v128i16(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128double.v128i16(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128float.v128i32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128double.v128i32(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128float.v128i64(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128double.v128i64(<128 x double> undef, <128 x i1> 
undef, i32 undef) + %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128float.v128i1(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128double.v128i1(<128 x double> undef, <128 x i1> undef, i32 undef) + %nxv1f32_nxv1i8 = fptoui undef to %nxv1f64_nxv1i8 = fptoui undef to %nxv1f32_nxv1i16 = fptoui undef to @@ -2535,6 +4995,17 @@ define void @fptoui() { %nxv1f32_nxv1i1 = fptoui undef to %nxv1f64_nxv1i1 = fptoui undef to + %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1float.nxv1i8( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1double.nxv1i8( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptoui.nxv1float.nxv1i16( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1double.nxv1i16( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1float.nxv1i32( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1double.nxv1i32( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1float.nxv1i64( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1double.nxv1i64( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1float.nxv1i1( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1double.nxv1i1( undef, undef, i32 undef) + %nxv2f32_nxv2i8 = fptoui undef to %nxv2f64_nxv2i8 = fptoui undef to %nxv2f32_nxv2i16 = fptoui undef to @@ -2546,6 +5017,17 @@ define void @fptoui() { %nxv2f32_nxv2i1 = fptoui undef to %nxv2f64_nxv2i1 = fptoui undef to + %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2float.nxv2i8( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2double.nxv2i8( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2float.nxv2i16( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2double.nxv2i16( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2float.nxv2i32( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2double.nxv2i32( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2float.nxv2i64( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2double.nxv2i64( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2float.nxv2i1( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2double.nxv2i1( undef, undef, i32 undef) + %nxv4f32_nxv4i8 = fptoui undef to %nxv4f64_nxv4i8 = fptoui undef to %nxv4f32_nxv4i16 = fptoui undef to @@ -2557,6 +5039,17 @@ define void @fptoui() { %nxv4f32_nxv4i1 = fptoui undef to %nxv4f64_nxv4i1 = fptoui undef to + %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4float.nxv4i8( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4double.nxv4i8( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4float.nxv4i16( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4double.nxv4i16( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4float.nxv4i32( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4double.nxv4i32( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4float.nxv4i64( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4double.nxv4i64( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4float.nxv4i1( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i1 = call 
@llvm.vp.fptoui.nxv4double.nxv4i1( undef, undef, i32 undef) + %nxv8f32_nxv8i8 = fptoui undef to %nxv8f64_nxv8i8 = fptoui undef to %nxv8f32_nxv8i16 = fptoui undef to @@ -2568,6 +5061,17 @@ define void @fptoui() { %nxv8f32_nxv8i1 = fptoui undef to %nxv8f64_nxv8i1 = fptoui undef to + %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8float.nxv8i8( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8double.nxv8i8( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8float.nxv8i16( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8double.nxv8i16( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8float.nxv8i32( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8double.nxv8i32( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8float.nxv8i64( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8double.nxv8i64( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8float.nxv8i1( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8double.nxv8i1( undef, undef, i32 undef) + %nxv16f32_nxv16i8 = fptoui undef to %nxv16f64_nxv16i8 = fptoui undef to %nxv16f32_nxv16i16 = fptoui undef to @@ -2579,6 +5083,17 @@ define void @fptoui() { %nxv16f32_nxv16i1 = fptoui undef to %nxv16f64_nxv16i1 = fptoui undef to + %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16float.nxv16i8( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16double.nxv16i8( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16float.nxv16i16( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16double.nxv16i16( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16float.nxv16i32( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16double.nxv16i32( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16float.nxv16i64( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16double.nxv16i64( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16float.nxv16i1( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16double.nxv16i1( undef, undef, i32 undef) + %nxv32f32_nxv32i8 = fptoui undef to %nxv32f64_nxv32i8 = fptoui undef to %nxv32f32_nxv32i16 = fptoui undef to @@ -2590,6 +5105,17 @@ define void @fptoui() { %nxv32f32_nxv32i1 = fptoui undef to %nxv32f64_nxv32i1 = fptoui undef to + %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32float.nxv32i8( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32double.nxv32i8( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptoui.nxv32float.nxv32i16( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32double.nxv32i16( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32float.nxv32i32( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32double.nxv32i32( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32float.nxv32i64( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32double.nxv32i64( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32float.nxv32i1( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32double.nxv32i1( undef, undef, i32 undef) + %nxv64f32_nxv64i8 = fptoui undef to %nxv64f64_nxv64i8 = fptoui undef to 
%nxv64f32_nxv64i16 = fptoui undef to @@ -2601,6 +5127,17 @@ define void @fptoui() { %nxv64f32_nxv64i1 = fptoui undef to %nxv64f64_nxv64i1 = fptoui undef to + %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64float.nxv64i8( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64double.nxv64i8( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64float.nxv64i16( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64double.nxv64i16( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64float.nxv64i32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64double.nxv64i32( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64float.nxv64i64( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64double.nxv64i64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64float.nxv64i1( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64double.nxv64i1( undef, undef, i32 undef) + ret void } @@ -2616,6 +5153,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> ; 
RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> @@ -2626,6 +5173,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> @@ -2636,6 +5193,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> @@ -2646,6 +5213,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float> @@ -2656,6 +5233,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to 
<64 x float> @@ -2666,6 +5253,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float> @@ -2676,6 +5273,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; 
RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp undef to @@ -2686,6 +5293,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp undef to @@ -2696,6 +5313,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp undef to @@ -2706,6 +5333,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp undef to @@ -2716,6 +5353,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp 
undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp undef to @@ -2726,6 +5373,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp undef to @@ -2736,6 +5393,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i16( undef, undef, i32 
undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp undef to @@ -2746,6 +5413,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'sitofp' @@ -2759,6 +5436,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> 
undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> @@ -2769,6 +5456,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 
undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> @@ -2779,6 +5476,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> @@ -2789,6 +5496,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float> @@ -2799,6 +5516,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for 
instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float> @@ -2809,6 +5536,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost 
of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float> @@ -2819,6 +5556,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: 
%vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp undef to @@ -2829,6 +5576,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp undef to @@ -2839,6 +5596,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) 
+; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp undef to @@ -2849,6 +5616,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp undef to @@ -2859,6 +5636,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp undef to @@ -2869,6 +5656,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp undef to @@ -2879,6 +5676,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 
34 for instruction: %nxv64i16_nxv64f32 = sitofp undef to @@ -2889,6 +5696,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> @@ -2902,6 +5719,17 @@ define void @sitofp() { %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> + %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i8.v2float(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i8.v2double(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i16.v2float(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i16.v2double(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i32.v2float(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i32.v2double(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i64.v2float(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i64.v2double(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i1.v2float(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i1.v2double(<2 x i1> undef, <2 x i1> undef, i32 undef) + %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x 
double> %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> @@ -2913,6 +5741,17 @@ define void @sitofp() { %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> + %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i8.v4float(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i8.v4double(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i16.v4float(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i16.v4double(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i32.v4float(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i32.v4double(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i64.v4float(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i64.v4double(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i1.v4float(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i1.v4double(<4 x i1> undef, <4 x i1> undef, i32 undef) + %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> @@ -2924,6 +5763,17 @@ define void @sitofp() { %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> + %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i8.v8float(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i8.v8double(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i16.v8float(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i16.v8double(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i32.v8float(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i32.v8double(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i64.v8float(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i64.v8double(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i1.v8float(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i1.v8double(<8 x i1> undef, <8 x i1> undef, i32 undef) + %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> @@ -2935,6 +5785,17 @@ define void @sitofp() { %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> + %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i8.v16float(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i8.v16double(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i16.v16float(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i16.v16double(<16 x i16> undef, <16 x i1> undef, i32 
undef) + %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i32.v16float(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i32.v16double(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i64.v16float(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i64.v16double(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i1.v16float(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i1.v16double(<16 x i1> undef, <16 x i1> undef, i32 undef) + %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float> @@ -2946,6 +5807,17 @@ define void @sitofp() { %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> + %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i8.v32float(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i8.v32double(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i16.v32float(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i16.v32double(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i32.v32float(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i32.v32double(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i64.v32float(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i64.v32double(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i1.v32float(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i1.v32double(<32 x i1> undef, <32 x i1> undef, i32 undef) + %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float> @@ -2957,6 +5829,17 @@ define void @sitofp() { %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> + %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i8.v64float(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i8.v64double(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i16.v64float(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i16.v64double(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i32.v64float(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i32.v64double(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i64.v64float(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i64.v64double(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f32 = call <64 x 
float> @llvm.vp.sitofp.v64i1.v64float(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i1.v64double(<64 x i1> undef, <64 x i1> undef, i32 undef) + %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float> @@ -2968,6 +5851,17 @@ define void @sitofp() { %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> + %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i8.v128float(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i8.v128double(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i16.v128float(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i16.v128double(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i32.v128float(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i32.v128double(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i64.v128float(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i64.v128double(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i1.v128float(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i1.v128double(<128 x i1> undef, <128 x i1> undef, i32 undef) + %nxv1i8_nxv1f32 = sitofp undef to %nxv1i8_nxv1f64 = sitofp undef to %nxv1i16_nxv1f32 = sitofp undef to @@ -2979,6 +5873,17 @@ define void @sitofp() { %nxv1i1_nxv1f32 = sitofp undef to %nxv1i1_nxv1f64 = sitofp undef to + %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1i8.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1i8.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1i16.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1i16.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1i32.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1i32.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1i64.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1i64.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1i1.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1i1.nxv1double( undef, undef, i32 undef) + %nxv2i8_nxv2f32 = sitofp undef to %nxv2i8_nxv2f64 = sitofp undef to %nxv2i16_nxv2f32 = sitofp undef to @@ -2990,6 +5895,17 @@ define void @sitofp() { %nxv2i1_nxv2f32 = sitofp undef to %nxv2i1_nxv2f64 = sitofp undef to + %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2i8.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2i8.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2i16.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2i16.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f32 = call 
@llvm.vp.sitofp.nxv2i32.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2i32.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2i64.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2i64.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2i1.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2i1.nxv2double( undef, undef, i32 undef) + %nxv4i8_nxv4f32 = sitofp undef to %nxv4i8_nxv4f64 = sitofp undef to %nxv4i16_nxv4f32 = sitofp undef to @@ -3001,6 +5917,17 @@ define void @sitofp() { %nxv4i1_nxv4f32 = sitofp undef to %nxv4i1_nxv4f64 = sitofp undef to + %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4i8.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4i8.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4i16.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4i16.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4i32.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4i32.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4i64.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4i64.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4i1.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4i1.nxv4double( undef, undef, i32 undef) + %nxv8i8_nxv8f32 = sitofp undef to %nxv8i8_nxv8f64 = sitofp undef to %nxv8i16_nxv8f32 = sitofp undef to @@ -3012,6 +5939,17 @@ define void @sitofp() { %nxv8i1_nxv8f32 = sitofp undef to %nxv8i1_nxv8f64 = sitofp undef to + %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv8i8.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv8i8.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv8i16.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv8i16.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv8i32.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv8i32.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv8i64.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv8i64.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv8i1.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv8i1.nxv8double( undef, undef, i32 undef) + %nxv16i8_nxv16f32 = sitofp undef to %nxv16i8_nxv16f64 = sitofp undef to %nxv16i16_nxv16f32 = sitofp undef to @@ -3023,6 +5961,17 @@ define void @sitofp() { %nxv16i1_nxv16f32 = sitofp undef to %nxv16i1_nxv16f64 = sitofp undef to + %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16i8.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16i8.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16i16.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16i16.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16i32.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16i32.nxv16double( undef, undef, i32 undef) + 
%vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16i64.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16i64.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16i1.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16i1.nxv16double( undef, undef, i32 undef) + %nxv32i8_nxv32f32 = sitofp undef to %nxv32i8_nxv32f64 = sitofp undef to %nxv32i16_nxv32f32 = sitofp undef to @@ -3034,6 +5983,17 @@ define void @sitofp() { %nxv32i1_nxv32f32 = sitofp undef to %nxv32i1_nxv32f64 = sitofp undef to + %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32i8.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32i8.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32i16.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32i16.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32i32.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32i32.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32i64.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32i64.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32i1.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32i1.nxv32double( undef, undef, i32 undef) + %nxv64i8_nxv64f32 = sitofp undef to %nxv64i8_nxv64f64 = sitofp undef to %nxv64i16_nxv64f32 = sitofp undef to @@ -3045,6 +6005,17 @@ define void @sitofp() { %nxv64i1_nxv64f32 = sitofp undef to %nxv64i1_nxv64f64 = sitofp undef to + %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64i8.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64i8.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64i16.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64i16.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64i32.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64i32.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64i64.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64i64.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64i1.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64i1.nxv64double( undef, undef, i32 undef) + ret void } @@ -3060,6 +6031,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> @@ -3070,6 +6051,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> 
@llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> @@ -3080,6 +6071,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> @@ -3090,6 +6091,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 
x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> @@ -3100,6 +6111,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost 
of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> @@ -3110,6 +6131,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, 
<64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> @@ -3120,6 +6151,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp undef to @@ -3130,6 +6171,16 @@ define void @uitofp() { ; 
RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp undef to @@ -3140,6 +6191,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp undef to @@ -3150,6 +6211,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp undef to @@ -3160,6 +6231,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp undef to @@ -3170,6 +6251,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp undef to @@ -3180,6 +6271,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp undef to @@ -3190,6 +6291,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: 
%vp_nxv64fi8_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'uitofp' @@ -3203,6 +6314,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> @@ -3213,6 +6334,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> @@ -3223,6 +6354,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> +; RV64-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> @@ -3233,6 +6374,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost 
of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> @@ -3243,6 +6394,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> 
undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> @@ -3253,6 +6414,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> @@ -3263,6 +6434,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> +; 
RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp undef to @@ -3273,6 +6454,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i32( undef, undef, 
i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp undef to @@ -3283,6 +6474,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp undef to @@ -3293,6 +6494,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = 
uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp undef to @@ -3303,6 +6514,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i64( undef, undef, i32 undef) +; RV64-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp undef to @@ -3313,6 +6534,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp undef to @@ -3323,6 +6554,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: 
%vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp undef to @@ -3333,6 +6574,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> @@ -3346,6 +6597,17 @@ define void @uitofp() { %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> + %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i8.v2float(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i8.v2double(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i16.v2float(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i16.v2double(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i32.v2float(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i32.v2double(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i64.v2float(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i64.v2double(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i1.v2float(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i1.v2double(<2 x i1> undef, <2 x i1> undef, i32 undef) + %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> @@ -3357,6 +6619,17 @@ define void @uitofp() { %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> + %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i8.v4float(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i8.v4double(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i16.v4float(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i16.v4double(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i32.v4float(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i32.v4double(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i64.v4float(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i64.v4double(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i1.v4float(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i1.v4double(<4 x i1> undef, <4 x i1> undef, i32 undef) + %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> @@ -3368,6 +6641,17 @@ define void @uitofp() { %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> + %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i8.v8float(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i8.v8double(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f32 = call <8 x float> 
@llvm.vp.uitofp.v8i16.v8float(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i16.v8double(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i32.v8float(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i32.v8double(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i64.v8float(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i64.v8double(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i1.v8float(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i1.v8double(<8 x i1> undef, <8 x i1> undef, i32 undef) + %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> @@ -3379,6 +6663,17 @@ define void @uitofp() { %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> + %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i8.v16float(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i8.v16double(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i16.v16float(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i16.v16double(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i32.v16float(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i32.v16double(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i64.v16float(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i64.v16double(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i1.v16float(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i1.v16double(<16 x i1> undef, <16 x i1> undef, i32 undef) + %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> @@ -3390,6 +6685,17 @@ define void @uitofp() { %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> + %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i8.v32float(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i8.v32double(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i16.v32float(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i16.v32double(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i32.v32float(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i32.v32double(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i64.v32float(<32 x i64> undef, <32 x i1> undef, i32 undef) + 
%vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i64.v32double(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i1.v32float(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i1.v32double(<32 x i1> undef, <32 x i1> undef, i32 undef) + %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> @@ -3401,6 +6707,17 @@ define void @uitofp() { %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> + %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i8.v64float(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i8.v64double(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i16.v64float(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i16.v64double(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i32.v64float(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i32.v64double(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i64.v64float(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i64.v64double(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i1.v64float(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i1.v64double(<64 x i1> undef, <64 x i1> undef, i32 undef) + %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> @@ -3412,6 +6729,17 @@ define void @uitofp() { %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> + %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i8.v128float(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i8.v128double(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i16.v128float(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i16.v128double(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i32.v128float(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i32.v128double(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i64.v128float(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i64.v128double(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i1.v128float(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i1.v128double(<128 x i1> undef, <128 x i1> undef, i32 undef) + %nxv1i8_nxv1f32 = uitofp undef to %nxv1i8_nxv1f64 = uitofp undef to 
%nxv1i16_nxv1f32 = uitofp undef to @@ -3423,6 +6751,17 @@ define void @uitofp() { %nxv1i1_nxv1f32 = uitofp undef to %nxv1i1_nxv1f64 = uitofp undef to + %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1i8.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1i8.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1i16.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1i16.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1i32.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1i32.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1i64.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1i64.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1i1.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1i1.nxv1double( undef, undef, i32 undef) + %nxv2i8_nxv2f32 = uitofp undef to %nxv2i8_nxv2f64 = uitofp undef to %nxv2i16_nxv2f32 = uitofp undef to @@ -3434,6 +6773,17 @@ define void @uitofp() { %nxv2i1_nxv2f32 = uitofp undef to %nxv2i1_nxv2f64 = uitofp undef to + %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2i8.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2i8.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2i16.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2i16.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2i32.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2i32.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2i64.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2i64.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2i1.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2i1.nxv2double( undef, undef, i32 undef) + %nxv4i8_nxv4f32 = uitofp undef to %nxv4i8_nxv4f64 = uitofp undef to %nxv4i16_nxv4f32 = uitofp undef to @@ -3445,6 +6795,17 @@ define void @uitofp() { %nxv4i1_nxv4f32 = uitofp undef to %nxv4i1_nxv4f64 = uitofp undef to + %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4i8.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4i8.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4i16.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4i16.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4i32.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4i32.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4i64.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4i64.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4i1.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4i1.nxv4double( undef, undef, i32 undef) + %nxv8i8_nxv8f32 = uitofp undef to %nxv8i8_nxv8f64 = uitofp undef to %nxv8i16_nxv8f32 = uitofp undef to @@ -3456,6 +6817,17 @@ define void @uitofp() { %nxv8i1_nxv8f32 = uitofp undef to %nxv8i1_nxv8f64 = uitofp undef to + %vp_nxv8fi8_nxv8f32 = call 
@llvm.vp.uitofp.nxv8i8.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8i8.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8i16.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8i16.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8i32.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8i32.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8i64.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8i64.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8i1.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8i1.nxv8double( undef, undef, i32 undef) + %nxv16i8_nxv16f32 = uitofp undef to %nxv16i8_nxv16f64 = uitofp undef to %nxv16i16_nxv16f32 = uitofp undef to @@ -3467,6 +6839,17 @@ define void @uitofp() { %nxv16i1_nxv16f32 = uitofp undef to %nxv16i1_nxv16f64 = uitofp undef to + %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16i8.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16i8.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16i16.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16i16.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16i32.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16i32.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16i64.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16i64.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16i1.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16i1.nxv16double( undef, undef, i32 undef) + %nxv32i8_nxv32f32 = uitofp undef to %nxv32i8_nxv32f64 = uitofp undef to %nxv32i16_nxv32f32 = uitofp undef to @@ -3478,6 +6861,17 @@ define void @uitofp() { %nxv32i1_nxv32f32 = uitofp undef to %nxv32i1_nxv32f64 = uitofp undef to + %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32i8.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32i8.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32i16.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32i16.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32i32.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32i32.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32i64.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32i64.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32i1.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32i1.nxv32double( undef, undef, i32 undef) + %nxv64i8_nxv64f32 = uitofp undef to %nxv64i8_nxv64f64 = uitofp undef to %nxv64i16_nxv64f32 = uitofp undef to @@ -3489,6 +6883,17 @@ define void @uitofp() { %nxv64i1_nxv64f32 = uitofp undef to %nxv64i1_nxv64f64 = uitofp undef to + %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64i8.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f64 = call 
@llvm.vp.uitofp.nxv64i8.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64i16.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64i16.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64i32.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64i32.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64i64.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64i64.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64i1.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64i1.nxv64double( undef, undef, i32 undef) + ret void } From 76be3a0024fe0027bcba9a597fee32a8b2d962ae Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 4 Sep 2024 22:12:01 -0700 Subject: [PATCH 183/425] [DirectX] Fix crash in DXILOpBuilder for vector types (#107334) This function needs to return the "undefined" sigil for unknown types so that the actual error handling triggers instead of a crash. --- llvm/lib/Target/DirectX/DXILOpBuilder.cpp | 3 +-- llvm/test/CodeGen/DirectX/sin_vector_error.ll | 11 +++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/sin_vector_error.ll diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp index ab3ea61d05fc4..efe019a07acaa 100644 --- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp @@ -123,8 +123,7 @@ static OverloadKind getOverloadKind(Type *Ty) { case Type::StructTyID: return OverloadKind::ObjectType; default: - llvm_unreachable("invalid overload type"); - return OverloadKind::VOID; + return OverloadKind::UNDEFINED; } } diff --git a/llvm/test/CodeGen/DirectX/sin_vector_error.ll b/llvm/test/CodeGen/DirectX/sin_vector_error.ll new file mode 100644 index 0000000000000..45b8d403390b9 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/sin_vector_error.ll @@ -0,0 +1,11 @@ +; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.0-library %s 2>&1 | FileCheck %s +; The sin intrinsic needs to be scalarized before op lowering + +; CHECK: error: +; CHECK-SAME: in function sin_vector +; CHECK-SAME: Cannot create Sin operation: Invalid overload type + +define <4 x float> @sin_vector(<4 x float> %a) { + %x = call <4 x float> @llvm.sin.v4f32(<4 x float> %a) + ret <4 x float> %x +} From 787fc81437dfc924e4a7d6106248e335e32aeeeb Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 4 Sep 2024 22:12:19 -0700 Subject: [PATCH 184/425] [DirectX] Clean up trailing whitespace. 
NFC (#107335) --- llvm/include/llvm/IR/IntrinsicsDirectX.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 5f48e2ba939f5..d2c0859f52a24 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -36,7 +36,7 @@ def int_dx_cast_handle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>; def int_dx_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>; def int_dx_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>; def int_dx_clamp : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; -def int_dx_uclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; +def int_dx_uclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; def int_dx_saturate : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_dx_dot2 : @@ -70,7 +70,7 @@ def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty]>; -def int_dx_lerp : Intrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], +def int_dx_lerp : Intrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem, IntrWillReturn] >; def int_dx_length : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty]>; From d18ca271f1add262b4ee0318a980f78a402f5e9c Mon Sep 17 00:00:00 2001 From: Harini0924 Date: Wed, 4 Sep 2024 22:34:05 -0700 Subject: [PATCH 185/425] Reapply "[llvm-lit] Add precommit test to verify current behavior of glob expansion in lit's internal shell" (#106763) (#107169) This reverts commit 5af4ba2684b9b59de3bf8135f62e05ab68cfc489. The previous patch was missing the closing parenthesis `)` in the `CHECK` statement in the `llvm/utils/lit/tests/shtest-glob.py` file: `# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}}` This issue broke some build bots. This patch corrects the `CHECK` statement by adding the closing parenthesis: `# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}})` --- .../lit/tests/Inputs/shtest-glob/example_file1.input | 2 ++ .../lit/tests/Inputs/shtest-glob/example_file2.input | 2 ++ .../utils/lit/tests/Inputs/shtest-glob/glob-echo.txt | 2 ++ .../lit/tests/Inputs/shtest-glob/glob-mkdir.txt | 2 ++ llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg | 8 ++++++++ llvm/utils/lit/tests/shtest-glob.py | 12 ++++++++++++ 6 files changed, 28 insertions(+) create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg create mode 100644 llvm/utils/lit/tests/shtest-glob.py diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input new file mode 100644 index 0000000000000..0987c9081ca1f --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input @@ -0,0 +1,2 @@ +## This is the first example file used for testing glob pattern matching. +This is the first example file. 
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input
new file mode 100644
index 0000000000000..f1a843f308262
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input
@@ -0,0 +1,2 @@
+## This is the second example file used for testing glob pattern matching.
+This is the second example file.
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt
new file mode 100644
index 0000000000000..b69f5e74fd728
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt
@@ -0,0 +1,2 @@
+## Tests glob pattern expansion by listing matching files.
+# RUN: echo %S/example_file*.input
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt
new file mode 100644
index 0000000000000..d1329f5dbfaae
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt
@@ -0,0 +1,2 @@
+## Tests glob pattern handling in the mkdir command.
+# RUN: not mkdir %S/example_file*.input
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg
new file mode 100644
index 0000000000000..4e5f4cac4c465
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg
@@ -0,0 +1,8 @@
+import lit.formats
+
+config.name = "shtest-glob"
+config.suffixes = [".txt"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
+config.substitutions.append(("%{python}", '"%s"' % (sys.executable)))
diff --git a/llvm/utils/lit/tests/shtest-glob.py b/llvm/utils/lit/tests/shtest-glob.py
new file mode 100644
index 0000000000000..ae90f31907d49
--- /dev/null
+++ b/llvm/utils/lit/tests/shtest-glob.py
@@ -0,0 +1,12 @@
+## Tests glob pattern handling in echo command.
+
+# RUN: not %{lit} -a -v %{inputs}/shtest-glob \
+# RUN: | FileCheck -dump-input=fail -match-full-lines %s
+#
+# END.
+
+# CHECK: UNRESOLVED: shtest-glob :: glob-echo.txt ({{[^)]*}})
+# CHECK: TypeError: string argument expected, got 'GlobItem'
+
+# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}})
+# CHECK: # error: command failed with exit status: 1
From 16cda01d22c0ac1713f667d501bdca91594a4e13 Mon Sep 17 00:00:00 2001
From: Carl Ritson
Date: Thu, 5 Sep 2024 14:39:28 +0900
Subject: [PATCH 186/425] [AMDGPU] V_SET_INACTIVE optimizations (#98864)

Optimize V_SET_INACTIVE by allowing it to run in WWM. Hence WWM sections
are no longer broken up for inactive lane setting.
WWM V_SET_INACTIVE can typically be lowered to V_CNDMASK. Some cases
still require exec manipulation and V_MOV, as in the previous code.
GFX9 sees a slight instruction count increase in edge cases due to its
smaller constant bus.

Additionally, avoid introducing exec manipulation and V_MOVs where a
source of V_SET_INACTIVE is also the destination. This is a common
pattern, as WWM register pre-allocation often assigns the same register.
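
For illustration, a minimal sketch of the IR pattern this change targets
(the function name @wwm_example is made up; the intrinsic calls mirror the
existing set.inactive tests updated below):

  ; Inactive lanes of %in are given the value 42, then the result is read
  ; back in whole-wave mode and stored.
  define amdgpu_kernel void @wwm_example(ptr addrspace(1) %out, i32 %in) {
    %masked = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
    %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %masked)
    store i32 %wwm, ptr addrspace(1) %out
    ret void
  }

  declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
  declare i32 @llvm.amdgcn.strict.wwm.i32(i32)

On subtargets where the mask and sources fit the constant bus and literal
limits, the V_SET_INACTIVE produced here can be selected as a single
V_CNDMASK_B32 keyed on the saved live-lane mask, rather than the old
s_not exec / v_mov / s_not exec sequence; otherwise the V_MOV fallback is
still emitted.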
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 187 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 + llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 89 +- .../GlobalISel/llvm.amdgcn.set.inactive.ll | 401 ++-- .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 30 +- .../AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 29 +- .../atomic_optimizations_global_pointer.ll | 536 ++--- .../atomic_optimizations_local_pointer.ll | 1913 ++++++----------- .../atomic_optimizations_pixelshader.ll | 64 +- llvm/test/CodeGen/AMDGPU/cse-convergent.ll | 14 +- llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll | 18 +- .../AMDGPU/global_atomics_scan_fadd.ll | 607 ++---- .../AMDGPU/global_atomics_scan_fmax.ll | 422 ++-- .../AMDGPU/global_atomics_scan_fmin.ll | 422 ++-- .../AMDGPU/global_atomics_scan_fsub.ll | 607 ++---- .../llvm.amdgcn.set.inactive.chain.arg.ll | 389 ++-- .../AMDGPU/llvm.amdgcn.set.inactive.ll | 382 ++-- .../AMDGPU/set-inactive-wwm-overwrite.ll | 12 +- .../AMDGPU/should-not-hoist-set-inactive.ll | 5 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 64 +- llvm/test/CodeGen/AMDGPU/wqm.ll | 52 +- llvm/test/CodeGen/AMDGPU/wqm.mir | 23 +- .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 704 +++--- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 504 +++-- .../MIR/AMDGPU/machine-function-info.ll | 7 +- 25 files changed, 3205 insertions(+), 4278 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a857bdba53c3e..844f62abc2671 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2098,8 +2098,22 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { } } +Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) { + assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || + MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64); + for (auto &Op : MI.implicit_operands()) { + if (Op.isDef()) + continue; + Register OpReg = Op.getReg(); + if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO || + OpReg == AMDGPU::SCC) + continue; + return OpReg; + } + return Register(); +} + bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); switch (MI.getOpcode()) { @@ -2273,37 +2287,147 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } - case AMDGPU::V_SET_INACTIVE_B32: { - unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - // FIXME: We may possibly optimize the COPY once we find ways to make LLVM - // optimizations (mainly Register Coalescer) aware of WWM register liveness. - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) - .add(MI.getOperand(1)); - auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); - FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) - .add(MI.getOperand(2)); - BuildMI(MBB, MI, DL, get(NotOpc), Exec) - .addReg(Exec); - MI.eraseFromParent(); - break; - } + case AMDGPU::V_SET_INACTIVE_B32: case AMDGPU::V_SET_INACTIVE_B64: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; - unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), - MI.getOperand(0).getReg()) - .add(MI.getOperand(1)); - expandPostRAPseudo(*Copy); - auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); - FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten - Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), - MI.getOperand(0).getReg()) - .add(MI.getOperand(2)); - expandPostRAPseudo(*Copy); - BuildMI(MBB, MI, DL, get(NotOpc), Exec) - .addReg(Exec); + unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64 + ? AMDGPU::V_MOV_B64_PSEUDO + : AMDGPU::V_MOV_B32_e32; + Register ExecReg = RI.getExec(); + Register DstReg = MI.getOperand(0).getReg(); + MachineOperand &ActiveSrc = MI.getOperand(1); + MachineOperand &InactiveSrc = MI.getOperand(2); + + // Find implicit register defining lanes active outside WWM. + Register ExecSrcReg = findSetInactiveMask(MI); + assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region"); + // Note: default here is set to ExecReg so that functional MIR is still + // generated if implicit def is not found and assertions are disabled. + if (!ExecSrcReg) + ExecSrcReg = ExecReg; + + // Ideally in WWM this operation is lowered to V_CNDMASK; however, + // constant bus constraints and the presence of literal constants + // present an issue. + // Fallback to V_MOV base lowering in all but the common cases. + const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32; + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64; + const MCInstrDesc &Desc = get(Opcode); + + const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0); + const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0); + const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue()); + const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue()); + const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue()); + const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue()); + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + + int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64); + int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; + int ConstantBusUses = + 1 + // Starts at 1 for ExecSrcReg + (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) + + (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0); + int LiteralConstants = + ((ActiveSrc.isReg() || + (ActiveSrc.isImm() && isInlineConstant(ActiveImm))) + ? 0 + : 1) + + ((InactiveSrc.isReg() || + (InactiveSrc.isImm() && isInlineConstant(InactiveImm))) + ? 0 + : 1); + + bool UseVCndMask = + ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit; + if (VMov64 && UseVCndMask) { + // Decomposition must not introduce new literals. 
+ UseVCndMask &= + ActiveSrc.isReg() || + (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) || + (!isInlineConstant(ActiveImm)); + UseVCndMask &= InactiveSrc.isReg() || + (isInlineConstant(InactiveImmLo) && + isInlineConstant(InactiveImmHi)) || + (!isInlineConstant(InactiveImm)); + } + + if (UseVCndMask && VMov64) { + // Dual V_CNDMASK_B32 + MachineOperand ActiveLo = buildExtractSubRegOrImm( + MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr); + MachineOperand ActiveHi = buildExtractSubRegOrImm( + MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr); + MachineOperand InactiveLo = buildExtractSubRegOrImm( + MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr); + MachineOperand InactiveHi = buildExtractSubRegOrImm( + MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr); + if (ActiveSrc.isReg()) + ActiveHi.setIsKill(ActiveSrc.isKill()); + if (InactiveSrc.isReg()) + InactiveHi.setIsKill(InactiveSrc.isKill()); + BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0)) + .addImm(0) + .add(InactiveLo) + .addImm(0) + .add(ActiveLo) + .addReg(ExecSrcReg) + .addReg(DstReg, RegState::ImplicitDefine); + BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1)) + .addImm(0) + .add(InactiveHi) + .addImm(0) + .add(ActiveHi) + .addReg(ExecSrcReg) + .addReg(DstReg, RegState::ImplicitDefine); + } else if (UseVCndMask) { + // Single V_CNDMASK_B32 + BuildMI(MBB, MI, DL, Desc, DstReg) + .addImm(0) + .add(InactiveSrc) + .addImm(0) + .add(ActiveSrc) + .addReg(ExecSrcReg); + } else { + // Fallback V_MOV case. + // Avoid unnecessary work if a source VGPR is also the destination. + // This can happen if WWM register allocation was efficient. + // Note: this assumes WWM execution. + bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg; + bool DstIsInactive = + InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg; + if (!DstIsInactive) { + // Set exec mask to inactive lanes, + // but only if active lanes would be overwritten. + if (DstIsActive) { + BuildMI(MBB, MI, DL, get(NotOpc), ExecReg) + .addReg(ExecSrcReg) + .setOperandDead(3); // Dead scc + } + // Copy inactive lanes + MachineInstr *VMov = + BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc); + if (VMov64) + expandPostRAPseudo(*VMov); + } + if (!DstIsActive) { + // Set exec mask to active lanes + BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg); + // Copy active lanes + MachineInstr *VMov = + BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg()) + .add(ActiveSrc); + if (VMov64) + expandPostRAPseudo(*VMov); + } + // Restore WWM + BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1); + } MI.eraseFromParent(); break; } @@ -5647,6 +5771,9 @@ unsigned SIInstrInfo::buildExtractSubReg( MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const { + if (!SuperReg.getReg().isVirtual()) + return RI.getSubReg(SuperReg.getReg(), SubIdx); + MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); Register SubReg = MRI.createVirtualRegister(SubRC); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 4fd9b4366159b..71432510fdee4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1437,6 +1437,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { // This is used if an operand is a 32 bit register but needs to be aligned // regardless. 
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const; + + static Register findSetInactiveMask(const MachineInstr &MI); }; /// \brief Returns true if a reg:subreg pair P has a TRC class diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 9a51cbbb9f6b8..bc4b1936cb7e3 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -128,6 +128,7 @@ struct InstrInfo { char Needs = 0; char Disabled = 0; char OutNeeds = 0; + char MarkedStates = 0; }; struct BlockInfo { @@ -175,9 +176,10 @@ class SIWholeQuadMode : public MachineFunctionPass { SmallVector LiveMaskQueries; SmallVector LowerToMovInstrs; - SmallVector LowerToCopyInstrs; + SmallSetVector LowerToCopyInstrs; SmallVector KillInstrs; SmallVector InitExecInstrs; + SmallVector SetInactiveInstrs; void printInfo(); @@ -295,6 +297,9 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, assert(!(Flag & StateExact) && Flag != 0); + // Capture all states requested in marking including disabled ones. + II.MarkedStates |= Flag; + // Remove any disabled states from the flag. The user that required it gets // an undefined value in the helper lanes. For example, this can happen if // the result of an atomic is used by instruction that requires WQM, where @@ -478,7 +483,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, std::vector &Worklist) { char GlobalFlags = 0; bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); - SmallVector SetInactiveInstrs; SmallVector SoftWQMInstrs; bool HasImplicitDerivatives = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; @@ -512,9 +516,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // The WQM intrinsic requires its output to have all the helper lanes // correct, so we need it to be in WQM. Flags = StateWQM; - LowerToCopyInstrs.push_back(&MI); + LowerToCopyInstrs.insert(&MI); } else if (Opcode == AMDGPU::SOFT_WQM) { - LowerToCopyInstrs.push_back(&MI); + LowerToCopyInstrs.insert(&MI); SoftWQMInstrs.push_back(&MI); } else if (Opcode == AMDGPU::STRICT_WWM) { // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus @@ -555,16 +559,24 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, GlobalFlags |= StateStrictWQM; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { - III.Disabled = StateStrict; - MachineOperand &Inactive = MI.getOperand(2); - if (Inactive.isReg()) { - if (Inactive.isUndef()) { - LowerToCopyInstrs.push_back(&MI); - } else { - markOperand(MI, Inactive, StateStrictWWM, Worklist); + // Ignore these if V_SET_INACTIVE which already has exec src register. + // These are generated by an earlier pass which has seperately ensured + // WWM and provided a mask of inactive lanes. + Register ExecSrc = TII->findSetInactiveMask(MI); + if (!ExecSrc) { + // Disable strict states; StrictWQM will be added as required later. 
+ III.Disabled = StateStrict; + MachineOperand &Inactive = MI.getOperand(2); + if (Inactive.isReg()) { + if (Inactive.isUndef()) { + LowerToCopyInstrs.insert(&MI); + } else { + markOperand(MI, Inactive, StateStrictWWM, Worklist); + } } + SetInactiveInstrs.push_back(&MI); + BBI.NeedsLowering = true; } - SetInactiveInstrs.push_back(&MI); } else if (TII->isDisableWQM(MI)) { BBI.Needs |= StateExact; if (!(BBI.InNeeds & StateExact)) { @@ -1042,6 +1054,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n"); SmallVector SplitPoints; + Register ActiveLanesReg = 0; char State = BI.InitialState; for (MachineInstr &MI : llvm::make_early_inc_range( @@ -1058,6 +1071,21 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: SplitPoint = lowerKillF32(MBB, MI); break; + case AMDGPU::ENTER_STRICT_WWM: + ActiveLanesReg = MI.getOperand(0).getReg(); + break; + case AMDGPU::EXIT_STRICT_WWM: + ActiveLanesReg = 0; + break; + case AMDGPU::V_SET_INACTIVE_B32: + case AMDGPU::V_SET_INACTIVE_B64: + if (ActiveLanesReg) { + MI.addOperand(*MBB.getParent(), + MachineOperand::CreateReg(ActiveLanesReg, false, true)); + } else { + assert(State == StateExact || State == StateWQM); + } + break; default: break; } @@ -1497,13 +1525,14 @@ bool SIWholeQuadMode::lowerCopyInstrs() { } } for (MachineInstr *MI : LowerToCopyInstrs) { + LLVM_DEBUG(dbgs() << "simplify: " << *MI); + + Register RecomputeReg = 0; if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { assert(MI->getNumExplicitOperands() == 3); - // the only reason we should be here is V_SET_INACTIVE has - // an undef input so it is being replaced by a simple copy. - // There should be a second undef source that we should remove. - assert(MI->getOperand(2).isUndef()); + if (MI->getOperand(2).isReg()) + RecomputeReg = MI->getOperand(2).getReg(); MI->removeOperand(2); MI->untieRegOperand(1); } else { @@ -1514,7 +1543,19 @@ bool SIWholeQuadMode::lowerCopyInstrs() { ? (unsigned)AMDGPU::COPY : TII->getMovOpcode(TRI->getRegClassForOperandReg( *MRI, MI->getOperand(0))); + int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); + while (Index >= 0) { + MI->removeOperand(Index); + Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); + } + MI->setDesc(TII->get(CopyOp)); + LLVM_DEBUG(dbgs() << " -> " << *MI); + + if (RecomputeReg) { + LIS->removeInterval(RecomputeReg); + LIS->createAndComputeVirtRegInterval(RecomputeReg); + } } return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty(); } @@ -1656,6 +1697,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LowerToMovInstrs.clear(); KillInstrs.clear(); InitExecInstrs.clear(); + SetInactiveInstrs.clear(); StateTransition.clear(); ST = &MF.getSubtarget(); @@ -1712,6 +1754,21 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { Changed = true; } + // Check if V_SET_INACTIVE was touched by a strict state mode. + // If so, promote to WWM; otherwise lower to COPY. 
+ for (MachineInstr *MI : SetInactiveInstrs) { + if (LowerToCopyInstrs.contains(MI)) + continue; + if (Instructions[MI].MarkedStates & StateStrict) { + Instructions[MI].Needs |= StateStrictWWM; + Instructions[MI].Disabled &= ~StateStrictWWM; + Blocks[MI->getParent()].Needs |= StateStrictWWM; + } else { + LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI); + LowerToCopyInstrs.insert(MI); + } + } + LLVM_DEBUG(printInfo()); Changed |= lowerLiveMaskQueries(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 8f88aaedf7e95..137366a45cbdf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,18 +4,39 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_endpgm + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) + store i32 %tmp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { +; GCN-LABEL: set_inactive_imm_poison: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -24,18 +45,42 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) + store i64 %tmp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { +; GCN-LABEL: set_inactive_imm_poison_64: +; GCN: ; 
%bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -45,39 +90,43 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dword s5, s[2:3], 0x2c +; GCN-NEXT: s_buffer_load_dword s6, s[4:7], 0x0 +; GCN-NEXT: s_load_dword s7, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s2, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: s_cmp_lg_u32 s6, 56 ; GCN-NEXT: s_cselect_b32 s3, 1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_mov_b32 s2, 1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_cmp_lg_u32 s3, 0 -; GCN-NEXT: s_cbranch_scc0 .LBB2_2 +; GCN-NEXT: s_cbranch_scc0 .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %.one -; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: .LBB2_2: ; %Flow +; GCN-NEXT: .LBB4_2: ; %Flow ; GCN-NEXT: s_xor_b32 s2, s2, 1 ; GCN-NEXT: s_and_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cbranch_scc1 .LBB2_4 +; GCN-NEXT: s_cbranch_scc1 .LBB4_4 ; GCN-NEXT: ; %bb.3: ; %.zero ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: .LBB2_4: ; %.exit +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: .LBB4_4: ; %.exit ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) br i1 %cmp, label %.zero, label %.one .zero: @@ -96,19 +145,22 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 
v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0) store float %tmp, ptr addrspace(1) %out ret void } @@ -117,20 +169,23 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v2, 0xcccccccd -; GCN-NEXT: v_mov_b32_e32 v3, 0x4010cccc +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0xcccccccd +; GCN-NEXT: v_mov_b32_e32 v1, 0x4010cccc +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0) store double %tmp, ptr addrspace(1) %out ret void } @@ -138,19 +193,22 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x10001 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> ) #0 + %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> ) #0 + %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0) store <2 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -158,19 +216,22 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> ) #0 + %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> ) #0 + %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2i16(<2 x half> %tmp.0) store <2 x half> %tmp, ptr addrspace(1) %out ret void } @@ -179,22 +240,25 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 1 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 1 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 + %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 + %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0) store <2 x i32> %tmp, ptr addrspace(1) %out ret void } @@ -203,22 +267,25 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 1.0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 1.0 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x 
float> %in, <2 x float> ) #0 + %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> ) #0 + %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0) store <2 x float> %tmp, ptr addrspace(1) %out ret void } @@ -226,19 +293,22 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 + %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 + %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0) store <2 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -247,22 +317,25 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x10001 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 0x10001 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 + %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 + %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0) store <4 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -271,22 +344,25 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: 
s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 + %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 + %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0) store <4 x half> %tmp, ptr addrspace(1) %out ret void } @@ -295,22 +371,25 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x3f803f80 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 0x3f803f80 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 + %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 + %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0) store <4 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -319,18 +398,23 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0) store ptr %tmp, ptr addrspace(1) %out ret void } @@ -338,18 +422,22 @@ define amdgpu_kernel void 
@set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0) store ptr addrspace(2) %tmp, ptr addrspace(1) %out ret void } @@ -357,18 +445,22 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0) store ptr addrspace(3) %tmp, ptr addrspace(1) %out ret void } @@ -376,18 +468,22 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0) store ptr addrspace(5) %tmp, ptr addrspace(1) %out ret void } @@ -395,24 +491,31 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0) store ptr addrspace(6) %tmp, ptr addrspace(1) %out ret void } declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) attributes #0 = { convergent readnone } +attributes #1 = { convergent nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll index c92b78cd45573..e34ae52fc673a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -571,11 +571,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop @@ -591,10 +590,9 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; 
GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop @@ -609,11 +607,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; DAGISEL-GFX11-LABEL: chain_to_chain_wwm: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART @@ -629,11 +626,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; DAGISEL-GFX10-LABEL: chain_to_chain_wwm: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll index 8d9ed9bb4343c..320268564f4db 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -329,10 +329,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND @@ -351,10 +351,9 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND @@ -371,11 +370,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre 
; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND @@ -393,11 +391,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index cc7050d08541a..5a8df7b84bf2f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1147,11 +1147,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1200,11 +1198,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1247,13 +1243,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: add_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1310,11 +1303,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: add_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1364,27 +1354,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-LABEL: add_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1438,35 +1425,33 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-LABEL: add_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -1500,27 +1485,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-LABEL: add_i32_varying: ; GFX1264_DPP: ; %bb.0: ; %entry ; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; 
GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1578,33 +1560,30 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-LABEL: add_i32_varying: ; GFX1232_DPP: ; %bb.0: ; %entry ; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; 
GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -2918,15 +2897,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3018,15 +2991,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3115,47 +3082,41 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 
v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -3228,40 +3189,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: 
v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -3323,21 +3278,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -3441,22 +3390,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 ; GFX1132_DPP-NEXT: 
s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3537,56 +3481,50 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 @@ -3658,35 +3596,29 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: 
v_dual_and_b32 v8, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4894,11 +4826,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, 
exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4947,11 +4877,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4994,13 +4922,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: sub_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5057,11 +4982,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: sub_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5111,27 +5033,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-LABEL: sub_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 
s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5185,35 +5104,33 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-LABEL: sub_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 
s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -5247,27 +5164,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-LABEL: sub_i32_varying: ; GFX1264_DPP: ; %bb.0: ; %entry ; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; 
GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5325,33 +5239,30 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-LABEL: sub_i32_varying: ; GFX1232_DPP: ; %bb.0: ; %entry ; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -6707,15 +6618,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5] ; 
GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6807,15 +6712,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6904,47 +6803,41 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: 
v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -7017,40 +6910,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -7112,21 +6999,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -7230,22 +7111,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7326,56 +7202,50 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; 
GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 @@ -7447,35 +7317,29 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_mov_b32_dpp 
v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 6d0e0cc7869b3..6bf03a202c143 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -780,14 +780,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -827,14 +825,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 
1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -870,13 +866,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: add_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -925,13 +918,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: add_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -971,33 +961,30 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: add_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -1037,27 +1024,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: add_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -1315,11 +1299,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1350,11 +1332,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1381,11 +1361,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1064_DPP-LABEL: add_i32_varying_nouse: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1414,11 +1391,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1032_DPP-LABEL: add_i32_varying_nouse: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1442,34 +1416,32 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_DPP-LABEL: add_i32_varying_nouse: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164_DPP-NEXT: ; %bb.1: @@ -1482,26 +1454,24 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_DPP-LABEL: add_i32_varying_nouse: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -2398,15 +2368,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2493,15 +2457,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2585,47 +2543,41 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -2689,43 +2641,37 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: 
v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -2734,10 +2680,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -2779,21 +2725,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -2891,23 +2831,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3235,15 +3170,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3312,15 +3241,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: 
v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3386,23 +3309,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3451,22 +3368,18 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3507,21 +3420,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164_DPP-NEXT: 
v_and_b32_e32 v6, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v6 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v7 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 @@ -3575,22 +3482,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v7 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s0 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4400,14 +4302,12 @@ define amdgpu_kernel void 
@sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4447,14 +4347,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4490,13 +4388,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: sub_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4545,13 +4440,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: sub_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4591,33 +4483,30 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: sub_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -4657,27 +4546,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: sub_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -4935,11 +4821,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4970,11 +4854,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5001,11 +4883,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1064_DPP-LABEL: sub_i32_varying_nouse: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5034,11 +4913,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1032_DPP-LABEL: sub_i32_varying_nouse: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5062,34 +4938,32 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_DPP-LABEL: sub_i32_varying_nouse: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; 
GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164_DPP-NEXT: ; %bb.1: @@ -5102,26 +4976,24 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_DPP-LABEL: sub_i32_varying_nouse: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -6044,15 +5916,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6139,15 
+6005,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6231,47 +6091,41 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -6335,43 +6189,37 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -6380,10 +6228,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf 
bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -6425,21 +6273,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -6537,23 +6379,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -6943,13 +6780,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -6992,13 +6825,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7037,13 +6866,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: and_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7092,13 +6918,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: and_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; 
GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7138,33 +6961,30 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: and_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -7204,31 +7024,29 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: and_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; 
GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -7619,16 +7437,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 @@ -7683,16 +7495,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX9_DPP-NEXT: s_not_b64 
exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 @@ -7741,19 +7547,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: and_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1] ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf @@ -7822,19 +7624,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: and_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf @@ -7846,11 +7644,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; 
GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -7885,47 +7683,43 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: and_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7981,43 +7775,39 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: and_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 
-; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 @@ -8375,14 +8165,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8422,14 +8210,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8465,13 +8251,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: or_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8520,13 +8303,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: or_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8566,33 +8346,30 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: or_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -8632,27 +8409,24 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: or_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -9047,16 +8821,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec 
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 @@ -9111,16 +8879,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 @@ -9169,19 +8931,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: or_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9250,19 +9008,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: or_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp 
v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9274,11 +9028,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -9313,47 +9067,43 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: or_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -9409,43 +9159,39 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: or_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 @@ -9803,14 +9549,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9850,14 +9594,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9893,13 +9635,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: xor_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9948,13 +9687,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: xor_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9994,33 +9730,30 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: xor_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -10060,27 +9793,24 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: xor_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: 
v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -10475,16 +10205,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 @@ -10539,16 +10263,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 @@ -10597,19 +10315,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: xor_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; 
GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10678,19 +10392,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: xor_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10702,11 +10412,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -10741,47 +10451,43 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: xor_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -10837,43 +10543,39 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: xor_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 @@ -11232,12 +10934,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11281,12 +10982,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11325,13 +11025,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: max_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -11380,13 +11077,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: max_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -11426,33 +11120,30 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; 
GFX1164_DPP-LABEL: max_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -11492,31 +11183,29 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: max_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -12195,19 +11884,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: max_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: s_mov_b32 s0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: s_brev_b32 s1, 1 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -12294,19 +11983,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: max_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: s_mov_b32 s0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: s_brev_b32 s1, 1 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 ; GFX9_DPP-NEXT: 
v_mov_b32_e32 v4, s1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -12393,20 +12082,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: max_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_mov_b32 s0, 0 -; GFX1064_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1064_DPP-NEXT: s_mov_b32 s4, 0 +; GFX1064_DPP-NEXT: s_brev_b32 s5, 1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s5 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s4, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s5, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -12515,20 +12198,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s1, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo @@ -12610,77 +12287,70 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 
v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_readlane_b32 s0, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -12744,55 +12414,48 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 
:: v_dual_cndmask_b32 v4, v6, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 @@ -13158,12 +12821,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13207,12 +12869,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13251,13 +12912,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: min_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -13306,13 +12964,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: min_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 ; 
GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -13352,33 +13007,30 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: min_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -13418,31 +13070,29 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: min_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 
-2 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -14124,16 +13774,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: s_mov_b32 s6, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b32 s6, -1 ; GFX8_DPP-NEXT: s_brev_b32 s7, -2 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -14221,16 +13871,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: s_mov_b32 s6, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 ; GFX9_DPP-NEXT: s_brev_b32 s7, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: 
v_mov_b32_e32 v1, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -14321,14 +13971,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14433,17 +14077,11 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s6 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s7 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14530,77 +14168,70 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s6 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s7 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -14663,56 +14294,49 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s7 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: 
v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; 
GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -15076,14 +14700,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15123,14 +14745,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15166,13 +14786,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umax_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15221,13 +14838,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umax_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15267,33 +14881,30 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: umax_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -15333,27 +14944,24 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: umax_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 
v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -16033,14 +15641,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -16131,14 +15733,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -16227,13 +15823,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064_DPP-NEXT: 
s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -16337,20 +15929,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -16433,77 +16019,70 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -16564,56 +16143,49 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 @@ -16978,13 +16550,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17027,13 +16595,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17072,13 +16636,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umin_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -17127,13 +16688,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umin_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -17173,33 +16731,30 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: umin_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 
-1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -17239,31 +16794,29 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: umin_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -17939,14 +17492,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -18037,14 +17584,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -18133,13 +17674,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -18243,20 +17780,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 
v2, -1 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -18339,77 +17870,70 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf 
bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -18470,56 +17994,49 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 -; 
GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; 
GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 22eb8d05b5ff2..429e6c489bf6f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -277,11 +277,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX8-NEXT: s_mov_b64 exec, s[10:11] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[10:11] +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -334,11 +332,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX9-NEXT: s_mov_b64 exec, s[10:11] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[10:11] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -386,13 +382,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1064-NEXT: s_cbranch_execz .LBB1_4 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[10:11] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -449,13 +442,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1032-NEXT: s_cbranch_execz .LBB1_4 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, v0, s9 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -503,32 +493,30 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1164-NEXT: s_cbranch_execz .LBB1_4 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[10:11] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s12, v1, 31 -; GFX1164-NEXT: v_mov_b32_e32 v2, s12 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s12 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s12, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s13, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_writelane_b32 v3, s12, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[10:11] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX1164-NEXT: v_readlane_b32 s12, v1, 63 @@ -577,31 +565,29 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1132-NEXT: s_cbranch_execz .LBB1_4 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, v0, s9 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v1, -1, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s11, v1, 31 ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_readlane_b32 s10, v1, 15 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: 
s_or_saveexec_b32 s9, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_writelane_b32 v3, s10, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll index 0d74bd39b56fe..7aca63d34f51b 100644 --- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll +++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll @@ -12,12 +12,7 @@ define i32 @test(i32 %val, i32 %cond) { ; GCN-NEXT: s_mov_b32 exec_lo, s4 ; GCN-NEXT: s_or_saveexec_b32 s4, -1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s4, -1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s4 ; GCN-NEXT: v_mov_b32_e32 v5, 0 @@ -27,12 +22,7 @@ define i32 @test(i32 %val, i32 %cond) { ; GCN-NEXT: ; %bb.1: ; %if ; GCN-NEXT: s_or_saveexec_b32 s5, -1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s5 -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s5, -1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s5 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s5 ; GCN-NEXT: v_mov_b32_e32 v5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll index 82dc6d21cfe33..310f32ce8f83b 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll @@ -6,16 +6,13 @@ define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { ; GCN-LABEL: wwm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s6, s3 ; GCN-NEXT: s_mov_b32 s5, s2 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s4, s1 ; GCN-NEXT: s_mov_b32 s1, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 4, s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s0, 0 @@ -63,16 +60,13 @@ work: define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { ; GCN-LABEL: strict_wwm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s6, s3 ; GCN-NEXT: s_mov_b32 s5, s2 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s4, s1 ; GCN-NEXT: s_mov_b32 s1, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 4, s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 584b280cefb8a..311c609291886 100644 --- 
a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -816,12 +816,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -900,14 +898,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -981,14 +974,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1046,43 +1034,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -1113,15 +1096,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2049,12 +2027,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2133,14 +2109,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2214,14 +2185,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2279,43 +2245,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -2346,15 +2307,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: 
v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3342,12 +3298,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3426,14 +3380,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3507,14 +3456,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3572,43 +3516,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -3639,15 +3578,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; 
GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4131,12 +4065,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4215,14 +4147,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4296,14 +4223,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4361,43 +4283,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -4428,15 +4345,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 
row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5449,12 +5361,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -5533,14 +5443,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5614,14 +5519,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5679,43 +5579,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -5759,15 +5654,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: 
v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7442,14 +7332,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -7579,15 +7463,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7707,15 +7585,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7819,17 +7691,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -7864,11 +7731,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -7946,16 +7813,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9047,14 +8909,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -9152,15 +9008,9 @@ 
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9246,15 +9096,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9324,17 +9168,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -9366,9 +9205,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; 
GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -9419,16 +9259,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10497,14 +10332,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -10602,15 +10431,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10696,15 +10519,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; 
GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10774,17 +10591,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -10816,9 +10628,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -10869,16 +10682,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11429,14 +11237,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -11534,15 +11336,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11628,15 +11424,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11706,17 +11496,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 
v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -11748,9 +11533,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -11801,16 +11587,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13526,14 +13307,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; 
GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -13663,15 +13438,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13791,15 +13560,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13903,17 +13666,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -13948,11 +13706,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -14030,16 +13788,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 464ec088dc297..9dc82b17bd3f4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -718,12 +718,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -811,15 +809,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, 
v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -888,15 +881,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -947,46 +935,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1023,42 +1006,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -1777,12 +1756,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -1870,15 +1847,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1947,15 +1919,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; 
GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2006,46 +1973,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2082,42 +2044,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; 
GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2836,12 +2794,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2929,15 +2885,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3006,15 +2957,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3065,46 +3011,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: 
s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3141,42 +3082,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 
s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -4807,14 +4744,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: 
v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -4953,15 +4884,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5091,15 +5016,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5211,17 +5130,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 
row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -5266,11 +5180,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -5350,16 +5264,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -6281,14 +6190,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -6395,15 +6298,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; 
GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6484,15 +6381,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6555,17 +6446,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -6607,9 +6493,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -6663,16 +6550,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; 
GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -8358,14 +8240,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -8504,15 +8380,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8642,15 +8512,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: 
v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8762,17 +8626,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8817,11 +8676,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -8901,16 +8760,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; 
GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 26a0e34d18bdb..945583c88ce26 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -718,12 +718,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -811,15 +809,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -888,15 +881,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -947,46 +935,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1023,42 +1006,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 
0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -1777,12 +1756,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; 
GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -1870,15 +1847,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1947,15 +1919,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2006,46 +1973,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 
row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2082,42 +2044,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: 
v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2836,12 +2794,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2929,15 +2885,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 
s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3006,15 +2957,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3065,46 +3011,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3141,42 +3082,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -4807,14 +4744,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -4953,15 +4884,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5091,15 +5016,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5211,17 +5130,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -5266,11 +5180,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -5350,16 +5264,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: 
v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -6281,14 +6190,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -6395,15 +6298,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6484,15 +6381,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6555,17 +6446,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -6607,9 +6493,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -6663,16 +6550,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -8358,14 +8240,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -8504,15 +8380,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8642,15 +8512,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8762,17 +8626,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8817,11 +8676,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -8901,16 +8760,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index c158a8007bcc5..3bc0f2546794d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -894,12 +894,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -978,14 +976,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; 
GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1059,14 +1052,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1124,43 +1112,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; 
GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -1204,15 +1187,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2239,12 +2217,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2323,14 +2299,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 
row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2404,14 +2375,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2469,43 +2435,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: 
v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -2549,15 +2510,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3584,12 +3540,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3668,14 +3622,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, 
v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3749,14 +3698,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3814,43 +3758,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; 
GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -3894,15 +3833,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4425,12 +4359,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4509,14 +4441,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: 
v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4590,14 +4517,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4655,43 +4577,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -4735,15 +4652,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5769,12 +5681,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -5853,14 +5763,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 
1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5934,14 +5839,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5999,43 +5899,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -6079,15 +5974,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7762,14 +7652,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -7899,15 +7783,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: 
v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8027,15 +7905,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8139,17 +8011,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -8184,11 +8051,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -8266,16 +8133,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9366,14 +9228,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -9471,15 +9327,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9565,15 +9415,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: 
v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9643,17 +9487,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -9685,9 +9524,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -9738,16 +9578,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ 
-10816,14 +10651,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -10921,15 +10750,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11015,15 +10838,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11093,17 +10910,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff 
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -11135,9 +10947,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -11188,16 +11001,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11748,14 +11556,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -11853,15 +11655,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: 
s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11947,15 +11743,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -12025,17 +11815,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -12067,9 +11852,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -12120,16 +11906,11 @@ 
define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13844,14 +13625,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -13981,15 +13756,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -14109,15 +13878,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: 
s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -14221,17 +13984,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -14266,11 +14024,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -14348,16 +14106,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll index b3acd4949301e..c1b58f1795aae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll @@ -12,97 +12,204 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 % ; GFX11-LABEL: set_inactive_chain_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: global_store_b32 v[8:9], v0, off +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: global_store_b32 v[8:9], v1, off ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: set_inactive_chain_arg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, v10 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: global_store_dword v[8:9], v0, off +; GFX10-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 +; GFX10-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: global_store_dword v[8:9], v1, off ; GFX10-NEXT: s_endpgm ; ; GFX11_W64-LABEL: set_inactive_chain_arg: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX11_W64-NEXT: s_not_b64 exec, exec +; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: global_store_b32 v[8:9], v0, off +; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] +; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11_W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11_W64-NEXT: global_store_b32 v[8:9], v1, off ; GFX11_W64-NEXT: s_endpgm ; ; GFX10_W64-LABEL: set_inactive_chain_arg: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX10_W64-NEXT: s_not_b64 exec, exec +; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: global_store_dword v[8:9], v0, off +; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] +; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX10_W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10_W64-NEXT: global_store_dword v[8:9], v1, off ; 
GFX10_W64-NEXT: s_endpgm %tmp = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0 - store i32 %tmp, ptr addrspace(1) %out + %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp) + store i32 %wwm, ptr addrspace(1) %out ret void } define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i64 %inactive, i64 %active) { -; GFX11-LABEL: set_inactive_chain_arg_64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v12 -; GFX11-NEXT: v_mov_b32_e32 v1, v13 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-NEXT: v_mov_b32_e32 v1, v11 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: global_store_b64 v[8:9], v[0:1], off -; GFX11-NEXT: s_endpgm +; GISEL11-LABEL: set_inactive_chain_arg_64: +; GISEL11: ; %bb.0: +; GISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_mov_b32 v1, v11 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 +; GISEL11-NEXT: v_mov_b32_e32 v2, v0 +; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL11-NEXT: v_mov_b32_e32 v3, v1 +; GISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off +; GISEL11-NEXT: s_endpgm ; -; GFX10-LABEL: set_inactive_chain_arg_64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v12 -; GFX10-NEXT: v_mov_b32_e32 v1, v13 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v10 -; GFX10-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: global_store_dwordx2 v[8:9], v[0:1], off -; GFX10-NEXT: s_endpgm +; DAGISEL11-LABEL: set_inactive_chain_arg_64: +; DAGISEL11: ; %bb.0: +; DAGISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL11-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL11-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off +; DAGISEL11-NEXT: s_endpgm ; -; GFX11_W64-LABEL: set_inactive_chain_arg_64: -; GFX11_W64: ; %bb.0: -; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v12 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, v13 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, v11 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: global_store_b64 v[8:9], v[0:1], off -; GFX11_W64-NEXT: s_endpgm +; GISEL10-LABEL: set_inactive_chain_arg_64: +; GISEL10: ; %bb.0: +; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_mov_b32_e32 
v0, v10 +; GISEL10-NEXT: v_mov_b32_e32 v1, v11 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; GISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 +; GISEL10-NEXT: v_mov_b32_e32 v2, v0 +; GISEL10-NEXT: v_mov_b32_e32 v3, v1 +; GISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; GISEL10-NEXT: s_endpgm ; -; GFX10_W64-LABEL: set_inactive_chain_arg_64: -; GFX10_W64: ; %bb.0: -; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v12 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, v13 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, v11 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: global_store_dwordx2 v[8:9], v[0:1], off -; GFX10_W64-NEXT: s_endpgm +; DAGISEL10-LABEL: set_inactive_chain_arg_64: +; DAGISEL10: ; %bb.0: +; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_mov_b32_e32 v1, v11 +; DAGISEL10-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL10-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL10-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; DAGISEL10-NEXT: s_endpgm +; +; GISEL11_W64-LABEL: set_inactive_chain_arg_64: +; GISEL11_W64: ; %bb.0: +; GISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v10 +; GISEL11_W64-NEXT: v_mov_b32_e32 v1, v11 +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL11_W64-NEXT: v_mov_b32_e32 v2, v0 +; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL11_W64-NEXT: v_mov_b32_e32 v3, v1 +; GISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off +; GISEL11_W64-NEXT: s_endpgm +; +; DAGISEL11_W64-LABEL: set_inactive_chain_arg_64: +; DAGISEL11_W64: ; %bb.0: +; DAGISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v11 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off +; DAGISEL11_W64-NEXT: s_endpgm +; +; GISEL10_W64-LABEL: set_inactive_chain_arg_64: +; GISEL10_W64: ; %bb.0: +; GISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL10_W64-NEXT: s_or_saveexec_b64 
s[0:1], -1 +; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v10 +; GISEL10_W64-NEXT: v_mov_b32_e32 v1, v11 +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL10_W64-NEXT: v_mov_b32_e32 v2, v0 +; GISEL10_W64-NEXT: v_mov_b32_e32 v3, v1 +; GISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; GISEL10_W64-NEXT: s_endpgm +; +; DAGISEL10_W64-LABEL: set_inactive_chain_arg_64: +; DAGISEL10_W64: ; %bb.0: +; DAGISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v11 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; DAGISEL10_W64-NEXT: s_endpgm %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0 - store i64 %tmp, ptr addrspace(1) %out + %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp) + store i64 %wwm, ptr addrspace(1) %out ret void } @@ -113,16 +220,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: v_mov_b32_e32 v0, v10 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v0 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-NEXT: global_store_b32 v[8:9], v2, off ; GFX11-NEXT: s_endpgm @@ -133,11 +237,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX10-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, v10 ; GFX10-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v0 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX10-NEXT: s_mov_b32 exec_lo, s0 @@ -151,17 +252,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 ; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v0 -; GFX11_W64-NEXT: s_not_b64 exec, exec ; 
GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11_W64-NEXT: s_waitcnt_depctr 0xfff +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11_W64-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11_W64-NEXT: v_mov_b32_e32 v2, v1 ; GFX11_W64-NEXT: global_store_b32 v[8:9], v2, off ; GFX11_W64-NEXT: s_endpgm @@ -172,11 +269,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10 ; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v0 -; GFX10_W64-NEXT: s_not_b64 exec, exec ; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] @@ -214,11 +308,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL11-NEXT: v_mov_b32_e32 v11, 0 ; GISEL11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: v_mov_b32_e32 v12, v40 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL11-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11-NEXT: s_endpgm @@ -244,11 +337,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0 ; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11-NEXT: s_endpgm @@ -283,10 +375,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL10-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL10-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10-NEXT: global_store_dword v[41:42], v0, off ; GISEL10-NEXT: s_endpgm @@ 
-321,10 +412,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10-NEXT: s_endpgm @@ -357,11 +447,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11_W64-NEXT: s_endpgm @@ -394,11 +483,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11_W64-NEXT: s_endpgm @@ -433,10 +521,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10_W64-NEXT: s_not_b64 exec, exec -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10_W64-NEXT: s_not_b64 exec, exec +; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; GISEL10_W64-NEXT: s_endpgm @@ -471,10 +558,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; 
DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10_W64-NEXT: s_endpgm @@ -511,11 +597,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0 ; GISEL11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: v_mov_b32_e32 v12, v40 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL11-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11-NEXT: s_endpgm @@ -541,11 +626,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL11-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11-NEXT: s_endpgm @@ -580,10 +664,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL10-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL10-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10-NEXT: global_store_dword v[41:42], v0, off ; GISEL10-NEXT: s_endpgm @@ -618,10 +701,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10-NEXT: s_endpgm @@ -654,11 +736,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; 
GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11_W64-NEXT: s_endpgm @@ -691,11 +772,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11_W64-NEXT: s_endpgm @@ -730,10 +810,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10_W64-NEXT: s_not_b64 exec, exec -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10_W64-NEXT: s_not_b64 exec, exec +; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; GISEL10_W64-NEXT: s_endpgm @@ -768,10 +847,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10_W64-NEXT: s_endpgm @@ -786,6 +864,7 @@ declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) declare i32 @llvm.amdgcn.strict.wwm.i32(i32) +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) declare amdgpu_gfx void @gfx_callee(<12 x i32>) attributes #0 = { convergent readnone willreturn nocallback nofree} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 114d2d099ab7b..6dc4a2ce0504b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,18 +5,22 @@ define amdgpu_kernel void @set_inactive(ptr 
addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -25,13 +29,15 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -40,20 +46,25 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -63,13 +74,16 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: 
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -82,12 +96,15 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc1 .LBB4_3 ; GCN-NEXT: ; %bb.1: ; %Flow @@ -96,19 +113,20 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-NEXT: .LBB4_2: ; %.exit ; GCN-NEXT: s_endpgm ; GCN-NEXT: .LBB4_3: ; %.one -; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GCN-NEXT: s_cbranch_execnz .LBB4_2 ; GCN-NEXT: .LBB4_4: ; %.zero ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) br i1 %cmp, label %.zero, label %.one .zero: @@ -127,19 +145,23 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x40400000 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x40400000 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0) store float %tmp, ptr addrspace(1) %out ret void } @@ -148,22 +170,27 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: ; GCN-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mov_b32 s0, 0xcccccccd -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s1, 0x4010cccc -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0) store double %tmp, ptr addrspace(1) %out ret void } @@ -171,19 +198,23 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x10001 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x10001 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> ) #0 + %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> ) #0 + %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0) store <2 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -191,19 +222,23 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x3c003c00 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> ) #0 + %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> ) #0 + %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2i16(<2 x half> %tmp.0) store <2 x half> %tmp, ptr addrspace(1) %out ret void } @@ -212,22 +247,27 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 1 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 1 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 + %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 + %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0) store <2 x i32> %tmp, ptr addrspace(1) %out ret void } @@ -236,22 +276,27 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 1.0 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 1.0 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> ) #0 + %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> ) #0 + %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0) store <2 x float> %tmp, ptr addrspace(1) %out ret void } @@ -259,19 +304,23 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x3f803f80 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 + %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 + %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0) store <2 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -280,22 +329,27 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 0x10001 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 0x10001 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 + %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 + %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0) store <4 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -304,22 +358,27 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 0x3c003c00 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: 
v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 + %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 + %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0) store <4 x half> %tmp, ptr addrspace(1) %out ret void } @@ -328,22 +387,27 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 0x3f803f80 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 0x3f803f80 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 + %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 + %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0) store <4 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -352,20 +416,25 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0) store ptr %tmp, ptr addrspace(1) %out ret void } @@ -373,18 +442,22 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: 
s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0) store ptr addrspace(2) %tmp, ptr addrspace(1) %out ret void } @@ -392,18 +465,22 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0) store ptr addrspace(3) %tmp, ptr addrspace(1) %out ret void } @@ -411,18 +488,22 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) 
%tmp.0) store ptr addrspace(5) %tmp, ptr addrspace(1) %out ret void } @@ -430,24 +511,31 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0) store ptr addrspace(6) %tmp, ptr addrspace(1) %out ret void } declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) attributes #0 = { convergent readnone } +attributes #1 = { convergent nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 81858bd3d29ee..f60786c1bacbf 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -15,11 +15,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: s_or_saveexec_b32 s1, -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 @@ -82,12 +79,7 @@ define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrs ; GCN-NEXT: .LBB1_5: ; %.else ; GCN-NEXT: s_or_saveexec_b32 s1, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: v_mov_b32_e32 v2, v3 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s1, -1 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v3, s1 ; GCN-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 ; GCN-NEXT: v_mov_b32_e32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll index 09e342fe19066..90b32e29e98f6 100644 --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ 
b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -23,11 +23,8 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: ; %bb.3: ; %bb1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: s_or_saveexec_b32 s9, -1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, s4, s9 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s9 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index c3a81771a2790..ff692acda3c25 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1674,13 +1674,13 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v0, 42 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 42, s4, s2 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: global_store_dword v1, v2, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive: @@ -1688,15 +1688,16 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v0, 42 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 42, s4, s[2:3] +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: global_store_dword v1, v2, s[0:1] ; GFX1064-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -1705,31 +1706,32 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; GFX1032-LABEL: test_set_inactive_64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, s6 -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s7, s0 +; GFX1032-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_mov_b32_e32 
v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive_64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: v_mov_b32_e32 v1, s7 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s7, s[0:1] +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX1064-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -2921,6 +2923,8 @@ declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) declare float @llvm.amdgcn.strict.wwm.f32(float) +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) declare float @llvm.amdgcn.wwm.f32(float) declare i32 @llvm.amdgcn.wqm.i32(i32) declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 6b4c2da772cdc..ab84c0c905771 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -835,12 +835,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1] ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 @@ -851,12 +848,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 @@ -1317,7 +1311,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_nop 0 ; GFX9-W64-NEXT: 
buffer_load_dword v2, v2, s[0:3], 0 idxen -; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec +; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -1334,7 +1328,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { ; GFX10-W32-NEXT: s_clause 0x1 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec +; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2263,11 +2257,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) @@ -2293,11 +2284,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) @@ -2744,12 +2732,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1] ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 @@ -2760,12 +2745,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 @@ 
-2799,11 +2781,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) @@ -2829,11 +2808,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir index ef6d0780f395f..64a7c4457395c 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -40,6 +40,9 @@ define amdgpu_vs void @no_wqm_in_vs() { ret void } + define amdgpu_ps void @preloaded_set_inactive() { + ret void + } ... --- @@ -282,10 +285,10 @@ body: | # #CHECK-NOT: ENTER_STRICT_WWM #CHECK: BUFFER_LOAD_DWORDX2 -#CHECK-NOT: ENTER_STRICT_WWM +#CHECK: ENTER_STRICT_WWM #CHECK: V_SET_INACTIVE_B32 #CHECK: V_SET_INACTIVE_B32 -#CHECK: ENTER_STRICT_WWM +#CHECK-NOT: ENTER_STRICT_WWM #CHECK: V_MAX name: test_wwm_set_inactive_propagation tracksRegLiveness: true @@ -443,3 +446,19 @@ body: | %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) ... + +--- +# Preserve V_SET_INACTIVE with exec mask already specified +#CHECK-LABEL: name: preloaded_set_inactive +#CHECK: V_SET_INACTIVE_B32 +name: preloaded_set_inactive +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1, $vgpr2 + + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %mask:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %value:vgpr_32 = V_SET_INACTIVE_B32 %0:vgpr_32, %1:vgpr_32, implicit $exec, implicit-def $scc, implicit %mask:sreg_64 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index e79cb66dcd776..47e1897f6b420 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -30,15 +30,15 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -80,17 +80,10 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[34:35] +; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 @@ -177,11 +170,11 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -208,12 +201,8 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[34:35] +; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 
row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -270,34 +259,25 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[34:35] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[36:37] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB1_2: ; %merge +; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -378,26 +358,26 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-O0-NEXT: s_mov_b32 s40, s6 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: s_mov_b32 s42, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GFX9-O0-NEXT: s_mov_b32 s41, s7 -; GFX9-O0-NEXT: s_mov_b32 s42, s41 -; GFX9-O0-NEXT: s_mov_b32 s43, s40 +; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s43, s7 +; GFX9-O0-NEXT: s_mov_b32 s44, s43 +; GFX9-O0-NEXT: s_mov_b32 s45, s42 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s5 -; GFX9-O0-NEXT: s_mov_b32 s44, s35 +; GFX9-O0-NEXT: s_mov_b32 s46, s35 ; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s44 -; GFX9-O0-NEXT: s_mov_b32 s38, s43 -; GFX9-O0-NEXT: s_mov_b32 s39, s42 +; GFX9-O0-NEXT: s_mov_b32 s37, s46 +; GFX9-O0-NEXT: s_mov_b32 s38, s45 +; GFX9-O0-NEXT: s_mov_b32 s39, s44 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_getpc_b64 s[42:43] ; GFX9-O0-NEXT: s_add_u32 s42, s42, 
strict_wwm_called@rel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s43, s43, strict_wwm_called@rel32@hi+12 @@ -437,11 +417,11 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4 @@ -559,7 +539,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s48, s33 +; GFX9-O0-NEXT: s_mov_b32 s50, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill @@ -583,41 +563,41 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 -; GFX9-O0-NEXT: s_mov_b32 s38, s6 +; GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: s_mov_b32 s35, s39 -; GFX9-O0-NEXT: s_mov_b32 s44, s38 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_mov_b32 s41, s7 +; GFX9-O0-NEXT: s_mov_b32 s35, s41 +; GFX9-O0-NEXT: s_mov_b32 s42, s40 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 -; GFX9-O0-NEXT: s_mov_b32 s45, s37 -; GFX9-O0-NEXT: s_mov_b32 s40, s36 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43 -; GFX9-O0-NEXT: s_mov_b32 s41, s45 -; GFX9-O0-NEXT: s_mov_b32 s42, s44 -; GFX9-O0-NEXT: s_mov_b32 s43, s35 -; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3 +; GFX9-O0-NEXT: s_mov_b32 s43, s37 +; GFX9-O0-NEXT: s_mov_b32 s44, s36 +; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47 +; GFX9-O0-NEXT: s_mov_b32 s45, s43 +; GFX9-O0-NEXT: s_mov_b32 s46, s42 +; GFX9-O0-NEXT: s_mov_b32 s47, s35 +; GFX9-O0-NEXT: v_writelane_b32 v0, s44, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s45, 3 +; GFX9-O0-NEXT: v_writelane_b32 v0, s46, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s47, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, 
s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_mov_b64 exec, s[38:39] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 @@ -634,20 +614,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5 -; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3 +; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 0 +; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 1 +; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2 +; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4 +; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 @@ -679,14 +659,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 -; GFX9-O0-NEXT: s_mov_b32 s33, s48 +; GFX9-O0-NEXT: s_mov_b32 s33, s50 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O3-LABEL: strict_wwm_call_i64: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O3-NEXT: s_mov_b32 s40, s33 +; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill @@ -702,28 +682,26 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: s_getpc_b64 s[36:37] -; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[36:37], s[36:37], 0x0 -; 
GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O3-NEXT: s_getpc_b64 s[34:35] +; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[38:39], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-O3-NEXT: s_mov_b64 exec, s[38:39] +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 @@ -739,7 +717,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-O3-NEXT: s_mov_b32 s33, s40 +; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0) @@ -778,16 +756,18 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s40, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GFX9-O0-NEXT: s_mov_b32 s41, s35 +; GFX9-O0-NEXT: s_mov_b32 s42, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s43, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 @@ -796,21 +776,25 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: 
s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 @@ -851,28 +835,30 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 -; GFX9-O3-NEXT: s_mov_b32 s34, -1 -; GFX9-O3-NEXT: s_brev_b32 s35, -2 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: s_mov_b32 s36, -1 +; GFX9-O3-NEXT: s_brev_b32 s37, -2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, s36 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s37 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s35 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, s36 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, s37 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s34 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s35 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, s36 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s37 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s34 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s35 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 @@ -922,21 +908,9 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword 
v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -987,130 +961,110 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v1, s14 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v46, s25 -; GFX9-O0-NEXT: v_mov_b32_e32 v45, s26 -; GFX9-O0-NEXT: v_mov_b32_e32 v44, s27 -; GFX9-O0-NEXT: v_mov_b32_e32 v43, s28 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 +; GFX9-O0-NEXT: v_mov_b32_e32 v39, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v38, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v37, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v36, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v35, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v46, s23 +; GFX9-O0-NEXT: v_mov_b32_e32 v45, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v44, s25 +; GFX9-O0-NEXT: v_mov_b32_e32 v43, s26 +; GFX9-O0-NEXT: v_mov_b32_e32 v42, s27 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, s28 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, s29 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte 
Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v46 -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v45 -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v44 -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v39 +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v38 +; GFX9-O0-NEXT: 
buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v37 +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v36 +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v46 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v45 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v44 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v42 +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v41 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v40 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v39 ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v27, v46 -; GFX9-O0-NEXT: v_mov_b32_e32 v28, v45 -; GFX9-O0-NEXT: v_mov_b32_e32 v29, v44 -; GFX9-O0-NEXT: v_mov_b32_e32 v30, v43 -; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr42 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v27, v38 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v37 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_mov_b32_e32 v29, v36 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v30, v35 +; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr34 killed $exec ; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -1150,62 +1104,82 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s34 
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v34, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v35, v9 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v34, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v35, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v36, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v37, v7 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v36, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v37, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v38, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v39, v5 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v38, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v39, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v40, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v41, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v40, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v41, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v33 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v9 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v35 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v37 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v36 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v39 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 @@ -1245,16 +1219,8 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -1265,73 +1231,56 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-O3-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; GFX9-O3-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4 ; GFX9-O3-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:8 ; GFX9-O3-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12 ; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:16 ; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; GFX9-O3-NEXT: v_mov_b32_e32 v32, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v33, v2 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v32, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v33, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v34, v3 -; GFX9-O3-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v34, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v35, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v36, v5 -; GFX9-O3-NEXT: v_mov_b32_e32 v37, v6 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v36, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v37, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v38, v7 -; GFX9-O3-NEXT: v_mov_b32_e32 v39, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v38, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v39, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v40, v9 -; GFX9-O3-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v40, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:4 -; GFX9-O3-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:12 -; GFX9-O3-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:8 -; GFX9-O3-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:20 -; GFX9-O3-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:16 -; GFX9-O3-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:28 -; GFX9-O3-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:24 -; GFX9-O3-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:36 -; GFX9-O3-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:32 -; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v2, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v4, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], 
-1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v5, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v8, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v7, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v8, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v10, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v10, v33 +; GFX9-O3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-O3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX9-O3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX9-O3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX9-O3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX9-O3-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX9-O3-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 +; GFX9-O3-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s6 @@ -1359,24 +1308,21 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-O3-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] - %a2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0) - %b2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0) - %c2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %c, i64 0) - %d2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %d, i64 0) - %e2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %e, i64 0) + %a2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0) + %a2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %a2.i) + %b2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0) + %b2 = call i64 
@llvm.amdgcn.strict.wwm.i64(i64 %b2.i) + %c2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %c, i64 0) + %c2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %c2.i) + %d2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %d, i64 0) + %d2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %d2.i) + %e2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %e, i64 0) + %e2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %e2.i) store i64 %a2, ptr addrspace(5) %ptr %ptr_b = getelementptr i64, ptr addrspace(5) %ptr, i32 1 store i64 %b2, ptr addrspace(5) %ptr_b diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index a74dbe1de0d39..7f0db3e362b30 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -26,15 +26,15 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -63,17 +63,10 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5] +; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 @@ -154,11 +147,11 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -185,12 +178,8 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; 
GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] @@ -236,34 +225,25 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB1_2: ; %merge +; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -361,35 +341,35 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s9 -; GFX9-O0-NEXT: s_mov_b32 s16, s8 +; GFX9-O0-NEXT: s_mov_b32 s3, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s17 +; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s7 -; GFX9-O0-NEXT: s_mov_b32 s18, s6 +; GFX9-O0-NEXT: 
s_mov_b32 s17, s9 +; GFX9-O0-NEXT: s_mov_b32 s18, s8 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -418,13 +398,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9 +; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4 +; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -454,12 +434,12 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 @@ -613,35 +593,35 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4 
+; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: s_mov_b32 s8, s17 +; GFX9-O0-NEXT: s_mov_b32 s6, s19 +; GFX9-O0-NEXT: s_mov_b32 s7, s18 +; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s8 +; GFX9-O0-NEXT: s_mov_b32 s17, s15 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -678,12 +658,12 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -721,14 +701,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; 
GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: s_getpc_b64 s[2:3] @@ -792,16 +772,18 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s6, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s5 +; GFX9-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 @@ -810,21 +792,25 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 @@ -848,28 +834,30 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 -; GFX9-O3-NEXT: s_mov_b32 s4, -1 -; GFX9-O3-NEXT: s_brev_b32 s5, -2 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_mov_b32 s6, -1 +; GFX9-O3-NEXT: s_brev_b32 s7, -2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 
exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 @@ -927,15 +915,15 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -964,17 +952,10 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5] +; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 @@ -1055,11 +1036,11 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -1086,12 +1067,8 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] @@ -1137,34 +1114,25 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB8_2: ; %merge +; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1262,35 +1230,35 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s9 -; GFX9-O0-NEXT: s_mov_b32 s16, s8 +; GFX9-O0-NEXT: s_mov_b32 s3, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 
s9, s17 +; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s7 -; GFX9-O0-NEXT: s_mov_b32 s18, s6 +; GFX9-O0-NEXT: s_mov_b32 s17, s9 +; GFX9-O0-NEXT: s_mov_b32 s18, s8 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1319,13 +1287,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9 +; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4 +; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1355,12 +1323,12 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 @@ -1514,35 +1482,35 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: s_mov_b32 s8, s17 +; GFX9-O0-NEXT: s_mov_b32 s6, s19 +; GFX9-O0-NEXT: s_mov_b32 s7, s18 +; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s8 +; GFX9-O0-NEXT: s_mov_b32 s17, s15 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1579,12 +1547,12 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1622,14 +1590,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: s_getpc_b64 s[2:3] @@ -1693,16 +1661,18 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s6, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s5 +; GFX9-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 @@ -1711,21 +1681,25 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 @@ -1749,28 +1723,30 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 -; GFX9-O3-NEXT: s_mov_b32 s4, -1 -; GFX9-O3-NEXT: s_brev_b32 s5, -2 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_mov_b32 s6, -1 +; GFX9-O3-NEXT: s_brev_b32 s7, -2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, 
s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index b3ed7376a1ede..f73489b7db77c 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -273,12 +273,15 @@ define amdgpu_cs void @wwm_reserved_regs(ptr addrspace(1) %ptr, <4 x i32> inreg %ld1 = load volatile i32, ptr addrspace(1) %ptr %inactive0 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld1, i32 0) %inactive1 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld0, i32 0) - store volatile i32 %inactive0, ptr addrspace(1) %ptr - store volatile i32 %inactive1, ptr addrspace(1) %ptr + %wwm0 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive0) + %wwm1 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive1) + store volatile i32 %wwm0, ptr addrspace(1) %ptr + store volatile i32 %wwm1, ptr addrspace(1) %ptr ret void } declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #6 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #6 attributes #0 = { "no-signed-zeros-fp-math" = "true" } attributes #1 = { "amdgpu-dx10-clamp" = "false" } From 96a5aabbd6adada4525d5e0107e96e6f57dbdfbf Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 4 Sep 2024 20:22:27 -0700 Subject: [PATCH 187/425] [NFC][sanitizer] Thread safety annotation for Symbolizer --- compiler-rt/lib/sanitizer_common/sanitizer_symbolizer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer.h b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer.h index 16ef2f2fd717b..bd89dc4e302fc 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer.h @@ -185,17 +185,17 @@ class Symbolizer final { class ModuleNameOwner { public: explicit ModuleNameOwner(Mutex *synchronized_by) - : last_match_(nullptr), mu_(synchronized_by) { + : mu_(synchronized_by), last_match_(nullptr) { storage_.reserve(kInitialCapacity); } const char *GetOwnedCopy(const char *str); private: static const uptr kInitialCapacity = 1000; - InternalMmapVector storage_; - const char *last_match_; Mutex *mu_; + const char *last_match_ SANITIZER_GUARDED_BY(mu_); + InternalMmapVector storage_ 
SANITIZER_GUARDED_BY(*mu_); } module_names_; /// Platform-specific function for creating a Symbolizer object. @@ -220,7 +220,7 @@ class Symbolizer final { // always synchronized. Mutex mu_; - IntrusiveList tools_; + IntrusiveList tools_ SANITIZER_GUARDED_BY(mu_); explicit Symbolizer(IntrusiveList tools); From aafaa6943463b56db2928081dc72b116e246c249 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 23:15:25 -0700 Subject: [PATCH 188/425] [Target] Use templated MachineFunction::getSubtarget in *CallingConv.td. NFC (#107311) This hides away the static_cast needed to get the target specific Subtarget object. --- llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 10 ++++------ llvm/lib/Target/M68k/M68kCallingConv.td | 4 ++-- llvm/lib/Target/Mips/MipsCallingConv.td | 3 +-- llvm/lib/Target/PowerPC/PPCCallingConv.td | 6 ++---- llvm/lib/Target/SystemZ/SystemZCallingConv.td | 4 ++-- llvm/lib/Target/X86/X86CallingConv.td | 8 ++++---- 6 files changed, 15 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 4be64629ddac8..21412044d5a01 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -214,13 +214,11 @@ def RetCC_AMDGPU_Func : CallingConv<[ ]>; def CC_AMDGPU : CallingConv<[ - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() >= " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCIf<"State.getMachineFunction().getSubtarget().getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() >= " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", + CCIf<"State.getMachineFunction().getSubtarget().getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", CCDelegateTo> ]>; diff --git a/llvm/lib/Target/M68k/M68kCallingConv.td b/llvm/lib/Target/M68k/M68kCallingConv.td index 523f08e646151..cf7e5ef69463e 100644 --- a/llvm/lib/Target/M68k/M68kCallingConv.td +++ b/llvm/lib/Target/M68k/M68kCallingConv.td @@ -15,8 +15,8 @@ // TODO Verify C convention follows SysV M68K ABI class CCIfSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", F), A>; + : CCIf().", F), A>; //===----------------------------------------------------------------------===// // Return Value Calling Conventions diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 204f11f1107cf..866161bf50638 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -11,8 +11,7 @@ /// CCIfSubtarget - Match if the current subtarget has a feature F. class CCIfSubtarget : CCIf" - "(State.getMachineFunction().getSubtarget()).", + "State.getMachineFunction().getSubtarget().", F), A>; diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td index 825c1a29ed62c..899326ad46656 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.td +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -13,13 +13,11 @@ /// CCIfSubtarget - Match if the current subtarget has a feature F. 
class CCIfSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", + : CCIf().", F), A>; class CCIfNotSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", + : CCIf().", F), A>; class CCIfOrigArgWasNotPPCF128 diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index b0618aafa5da6..99bb697ce2014 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -12,8 +12,8 @@ class CCIfExtend : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>; class CCIfSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", F), + : CCIf().", F), A>; // Match if this specific argument is a fixed (i.e. named) argument. diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index c55ff3dfc9c8e..307aeb2ea4c6f 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -13,14 +13,14 @@ /// CCIfSubtarget - Match if the current subtarget has a feature F. class CCIfSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", F), + : CCIf().", F), A>; /// CCIfNotSubtarget - Match if the current subtarget doesn't has a feature F. class CCIfNotSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", F), + : CCIf().", F), A>; /// CCIfRegCallv4 - Match if RegCall ABIv4 is respected. From 0c1500ef05e0a5b25cae79d2bd361dbc6e14e726 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 23:19:14 -0700 Subject: [PATCH 189/425] [RISCV] Fix another RV32 Zdinx load/store addressing corner case. RISCVExpandPseudoInsts makes sure the offset is divisible by 8 so we need to enforce that during isel. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 2 +- .../CodeGen/RISCV/zdinx-boundary-check.ll | 64 +++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 7800688542160..4580f3191d138 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2691,7 +2691,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, Align Alignment = commonAlignment( GA->getGlobal()->getPointerAlignment(DL), GA->getOffset()); if ((CVal == 0 || Alignment > CVal) && - (!IsRV32Zdinx || Alignment > (CVal + 4))) { + (!IsRV32Zdinx || commonAlignment(Alignment, CVal) > 4)) { int64_t CombinedOffset = CVal + GA->getOffset(); Base = Base.getOperand(0); Offset = CurDAG->getTargetGlobalAddress( diff --git a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll index 01ecaee3d7e7b..a4f56b6d28409 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll @@ -493,3 +493,67 @@ entry: store double %d, ptr %add.ptr, align 8 ret void } + +@f = global double 4.2, align 16 + +define double @foo13(ptr nocapture %p) nounwind { +; RV32ZDINX-LABEL: foo13: +; RV32ZDINX: # %bb.0: # %entry +; RV32ZDINX-NEXT: addi sp, sp, -16 +; RV32ZDINX-NEXT: lui a0, %hi(f) +; RV32ZDINX-NEXT: lw a1, %lo(f+8)(a0) +; RV32ZDINX-NEXT: sw a1, 12(sp) +; RV32ZDINX-NEXT: lw a0, %lo(f+4)(a0) +; RV32ZDINX-NEXT: sw a0, 8(sp) +; RV32ZDINX-NEXT: lw a0, 8(sp) +; RV32ZDINX-NEXT: lw a1, 12(sp) +; RV32ZDINX-NEXT: addi sp, sp, 16 +; RV32ZDINX-NEXT: ret +; +; RV32ZDINXUALIGNED-LABEL: foo13: +; RV32ZDINXUALIGNED: # %bb.0: # %entry +; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(f) +; 
RV32ZDINXUALIGNED-NEXT: addi a0, a0, %lo(f) +; RV32ZDINXUALIGNED-NEXT: lw a1, 8(a0) +; RV32ZDINXUALIGNED-NEXT: lw a0, 4(a0) +; RV32ZDINXUALIGNED-NEXT: ret +; +; RV64ZDINX-LABEL: foo13: +; RV64ZDINX: # %bb.0: # %entry +; RV64ZDINX-NEXT: lui a0, %hi(f) +; RV64ZDINX-NEXT: lwu a1, %lo(f+8)(a0) +; RV64ZDINX-NEXT: lwu a0, %lo(f+4)(a0) +; RV64ZDINX-NEXT: slli a1, a1, 32 +; RV64ZDINX-NEXT: or a0, a1, a0 +; RV64ZDINX-NEXT: ret +entry: + %add.ptr = getelementptr inbounds i8, ptr @f, i64 4 + %0 = load double, ptr %add.ptr, align 4 + ret double %0 +} + +define double @foo14(ptr nocapture %p) nounwind { +; RV32ZDINX-LABEL: foo14: +; RV32ZDINX: # %bb.0: # %entry +; RV32ZDINX-NEXT: lui a0, %hi(f) +; RV32ZDINX-NEXT: lw a1, %lo(f+12)(a0) +; RV32ZDINX-NEXT: lw a0, %lo(f+8)(a0) +; RV32ZDINX-NEXT: ret +; +; RV32ZDINXUALIGNED-LABEL: foo14: +; RV32ZDINXUALIGNED: # %bb.0: # %entry +; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(f) +; RV32ZDINXUALIGNED-NEXT: lw a1, %lo(f+12)(a0) +; RV32ZDINXUALIGNED-NEXT: lw a0, %lo(f+8)(a0) +; RV32ZDINXUALIGNED-NEXT: ret +; +; RV64ZDINX-LABEL: foo14: +; RV64ZDINX: # %bb.0: # %entry +; RV64ZDINX-NEXT: lui a0, %hi(f) +; RV64ZDINX-NEXT: ld a0, %lo(f+8)(a0) +; RV64ZDINX-NEXT: ret +entry: + %add.ptr = getelementptr inbounds i8, ptr @f, i64 8 + %0 = load double, ptr %add.ptr, align 8 + ret double %0 +} From 77f04882251b1e44239d6d7545cd62301e903a4a Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 5 Sep 2024 08:11:29 +0100 Subject: [PATCH 190/425] [AArch64] Combine zext of deinterleaving shuffle. (#107201) This is part 1 of a few patches that are intended to take deinterleaving shuffles with masks like `[0,4,8,12]`, where the shuffle is zero-extended to a larger size, and optimize away the deinterleave. In this case it converts them to `and(uzp1, mask)`, where the `uzp1` act upon the elements in the larger type size to get the lanes into the correct possitions, and the `and` performs the zext. It performs the combine fairly late, on the legalized type so that uitofp that are converted to uitofp(zext(..)) will also be handled. --- .../Target/AArch64/AArch64ISelLowering.cpp | 56 ++++ llvm/test/CodeGen/AArch64/zext-shuffle.ll | 286 +++++++----------- 2 files changed, 163 insertions(+), 179 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 5e3f9364ac3e1..d1ddbfa300846 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22187,6 +22187,59 @@ performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255) +// This comes from interleaved vectorization. It is performed late to capture +// uitofp converts too. 
+static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if ((VT != MVT::v4i32 && VT != MVT::v8i16) || + N->getOpcode() != ISD::ZERO_EXTEND || + N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) + return SDValue(); + + unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1); + if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements()) + return SDValue(); + + EVT InVT = N->getOperand(0).getOperand(0).getValueType(); + auto *Shuffle = dyn_cast(N->getOperand(0).getOperand(0)); + if (!Shuffle || + InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 || + InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits()) + return SDValue(); + + unsigned Idx; + bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor( + Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx); + // An undef interleave shuffle can come up after other canonicalizations, + // where the shuffle has been converted to + // zext(extract(shuffle b, undef, [u,u,0,4])) + bool IsUndefDeInterleave = false; + if (!IsDeInterleave) + IsUndefDeInterleave = + Shuffle->getOperand(1).isUndef() && + ShuffleVectorInst::isDeInterleaveMaskOfFactor( + Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2, + VT.getVectorNumElements() / 2), + 4, Idx); + if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4) + return SDValue(); + SDLoc DL(N); + SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, + Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0)); + SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, + Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1)); + SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL, + VT, BC1, BC2); + if ((Idx & 1) == 1) + UZP = DAG.getNode(ISD::SRL, DL, VT, UZP, + DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT)); + return DAG.getNode( + ISD::AND, DL, VT, UZP, + DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT)); +} + static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -22207,6 +22260,9 @@ static SDValue performExtendCombine(SDNode *N, return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); } + if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG)) + return R; + if (N->getValueType(0).isFixedLengthVector() && N->getOpcode() == ISD::SIGN_EXTEND && N->getOperand(0)->getOpcode() == ISD::SETCC) diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll index 4ef8daf141715..af5a92017bbbc 100644 --- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll @@ -76,12 +76,9 @@ define <2 x i64> @v2i64_37(<4 x i32> %a, <4 x i32> %b) { define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_04812: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI6_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -93,12 +90,8 @@ define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) { define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_15913: ; 
CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI7_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -110,12 +103,9 @@ define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) { define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI8_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -127,12 +117,8 @@ define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) { define <4 x i64> @v2i64_i16_371115(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -167,12 +153,9 @@ define <4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_04812(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_04812: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI12_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -182,12 +165,8 @@ define <4 x i32> @v4i32_04812(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_15913(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI13_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -197,12 +176,9 @@ define <4 x i32> @v4i32_15913(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_261014(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -212,12 +188,8 @@ define <4 x i32> @v4i32_261014(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_371115(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -249,12 +221,8 @@ define <8 x i16> @v8i16_1357(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_04812(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_04812: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI18_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -264,12 +232,8 @@ define <8 x i16> @v8i16_04812(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_15913(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -279,12 +243,8 @@ define <8 x i16> @v8i16_15913(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_261014(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI20_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -294,12 +254,8 @@ define <8 x i16> @v8i16_261014(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI21_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_0] -; CHECK-NEXT: // kill: def $q0 
killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -310,42 +266,23 @@ define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) { define <8 x i64> @zext_add(<32 x i16> %l) { ; CHECK-LABEL: zext_add: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI22_0 -; CHECK-NEXT: adrp x9, .LCPI22_3 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_0] -; CHECK-NEXT: adrp x8, .LCPI22_1 -; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI22_3] -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI22_1] -; CHECK-NEXT: adrp x8, .LCPI22_2 -; CHECK-NEXT: adrp x9, .LCPI22_7 -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI22_2] -; CHECK-NEXT: adrp x8, .LCPI22_4 -; CHECK-NEXT: ldr q18, [x9, :lo12:.LCPI22_7] -; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI22_4] -; CHECK-NEXT: adrp x8, .LCPI22_5 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v5.16b, { v0.16b, v1.16b }, v5.16b -; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI22_5] -; CHECK-NEXT: adrp x8, .LCPI22_6 -; CHECK-NEXT: tbl v7.16b, { v0.16b, v1.16b }, v7.16b -; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI22_6] -; CHECK-NEXT: tbl v17.16b, { v0.16b, v1.16b }, v17.16b -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b -; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v4.16b -; CHECK-NEXT: tbl v4.16b, { v2.16b, v3.16b }, v6.16b -; CHECK-NEXT: tbl v6.16b, { v2.16b, v3.16b }, v16.16b -; CHECK-NEXT: tbl v2.16b, { v2.16b, v3.16b }, v19.16b -; CHECK-NEXT: uaddl v5.4s, v5.4h, v7.4h -; CHECK-NEXT: uaddl v7.4s, v17.4h, v0.4h -; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v4.8h -; CHECK-NEXT: uaddl2 v2.4s, v6.8h, v2.8h -; CHECK-NEXT: uaddl v0.2d, v5.2s, v7.2s -; CHECK-NEXT: uaddl2 v1.2d, v5.4s, v7.4s -; CHECK-NEXT: uaddl2 v3.2d, v4.4s, v2.4s -; CHECK-NEXT: uaddl v2.2d, v4.2s, v2.2s +; CHECK-NEXT: movi v4.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v5.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: and v3.16b, v5.16b, v4.16b +; CHECK-NEXT: and v6.16b, v0.16b, v4.16b +; CHECK-NEXT: and v7.16b, v1.16b, v4.16b +; CHECK-NEXT: and v4.16b, v2.16b, v4.16b +; CHECK-NEXT: usra v3.4s, v5.4s, #16 +; CHECK-NEXT: usra v6.4s, v0.4s, #16 +; CHECK-NEXT: usra v7.4s, v1.4s, #16 +; CHECK-NEXT: usra v4.4s, v2.4s, #16 +; CHECK-NEXT: uaddl v0.2d, v3.2s, v6.2s +; CHECK-NEXT: uaddl2 v1.2d, v3.4s, v6.4s +; CHECK-NEXT: uaddl2 v3.2d, v7.4s, v4.4s +; CHECK-NEXT: uaddl v2.2d, v7.2s, v4.2s ; CHECK-NEXT: ret %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> %z1 = zext <8 x i16> %s1 to <8 x i64> @@ -392,86 +329,77 @@ define <8 x i64> @zext_load_add(ptr %p) { define <8 x double> @uitofp_fadd(<32 x i16> %l) { ; CHECK-LABEL: uitofp_fadd: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI24_0 -; CHECK-NEXT: adrp x9, .LCPI24_1 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: adrp x10, .LCPI24_6 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_0] -; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI24_1] -; CHECK-NEXT: adrp x8, .LCPI24_2 -; CHECK-NEXT: adrp x9, .LCPI24_3 -; CHECK-NEXT: ldr q6, 
[x8, :lo12:.LCPI24_2] -; CHECK-NEXT: adrp x8, .LCPI24_4 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v4.16b, { v0.16b, v1.16b }, v4.16b -; CHECK-NEXT: tbl v5.16b, { v2.16b, v3.16b }, v5.16b -; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI24_3] -; CHECK-NEXT: adrp x9, .LCPI24_5 -; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI24_4] -; CHECK-NEXT: adrp x8, .LCPI24_7 -; CHECK-NEXT: ldr q17, [x9, :lo12:.LCPI24_5] -; CHECK-NEXT: ldr q18, [x10, :lo12:.LCPI24_6] -; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI24_7] -; CHECK-NEXT: tbl v6.16b, { v0.16b, v1.16b }, v6.16b -; CHECK-NEXT: tbl v7.16b, { v2.16b, v3.16b }, v7.16b -; CHECK-NEXT: tbl v16.16b, { v0.16b, v1.16b }, v16.16b -; CHECK-NEXT: tbl v17.16b, { v2.16b, v3.16b }, v17.16b -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b -; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v19.16b -; CHECK-NEXT: ushll2 v5.4s, v5.8h, #0 -; CHECK-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-NEXT: ushll2 v7.4s, v7.8h, #0 -; CHECK-NEXT: ushll v6.4s, v6.4h, #0 -; CHECK-NEXT: ushll v16.4s, v16.4h, #0 -; CHECK-NEXT: ushll2 v20.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v21.2d, v4.4s, #0 -; CHECK-NEXT: ushll2 v17.4s, v17.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v2.2d, v5.2s, #0 -; CHECK-NEXT: ushll v3.2d, v4.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v7.4s, #0 -; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0 -; CHECK-NEXT: ushll v7.2d, v7.2s, #0 -; CHECK-NEXT: ucvtf v18.2d, v20.2d -; CHECK-NEXT: ucvtf v19.2d, v21.2d +; CHECK-NEXT: uzp1 v5.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v6.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: movi d4, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8 +; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #8 +; CHECK-NEXT: uzp2 v1.4s, v0.4s, v3.4s +; CHECK-NEXT: and v17.8b, v6.8b, v4.8b +; CHECK-NEXT: and v18.8b, v7.8b, v4.8b +; CHECK-NEXT: ushr v6.2s, v6.2s, #16 +; CHECK-NEXT: ushr v7.2s, v7.2s, #16 +; CHECK-NEXT: and v21.8b, v0.8b, v4.8b +; CHECK-NEXT: and v22.8b, v2.8b, v4.8b +; CHECK-NEXT: ushr v2.2s, v2.2s, #16 +; CHECK-NEXT: and v19.8b, v16.8b, v4.8b +; CHECK-NEXT: and v20.8b, v5.8b, v4.8b +; CHECK-NEXT: ushll v3.2d, v17.2s, #0 +; CHECK-NEXT: ushll v17.2d, v18.2s, #0 +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ushr v16.2s, v16.2s, #16 +; CHECK-NEXT: ushr v5.2s, v5.2s, #16 ; CHECK-NEXT: ushll v6.2d, v6.2s, #0 -; CHECK-NEXT: ushll2 v20.2d, v17.4s, #0 -; CHECK-NEXT: ushll2 v21.2d, v16.4s, #0 -; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ushll v7.2d, v7.2s, #0 +; CHECK-NEXT: ushll v18.2d, v19.2s, #0 +; CHECK-NEXT: ushll v19.2d, v20.2s, #0 +; CHECK-NEXT: ext v20.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: ushll v16.2d, v16.2s, #0 -; CHECK-NEXT: ushll v22.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v23.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ushll v21.2d, v21.2s, #0 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: ushll v22.2d, v22.2s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 ; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v5.2d, v5.2d -; CHECK-NEXT: ucvtf v7.2d, v7.2d -; CHECK-NEXT: ucvtf v6.2d, v6.2d -; CHECK-NEXT: ucvtf v20.2d, v20.2d -; CHECK-NEXT: ucvtf v21.2d, v21.2d ; CHECK-NEXT: ucvtf v17.2d, v17.2d +; CHECK-NEXT: 
ucvtf v6.2d, v6.2d +; CHECK-NEXT: and v23.8b, v20.8b, v4.8b +; CHECK-NEXT: and v4.8b, v1.8b, v4.8b +; CHECK-NEXT: ushr v20.2s, v20.2s, #16 +; CHECK-NEXT: ushr v1.2s, v1.2s, #16 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: ucvtf v18.2d, v18.2d +; CHECK-NEXT: ucvtf v19.2d, v19.2d ; CHECK-NEXT: ucvtf v16.2d, v16.2d +; CHECK-NEXT: ushll v23.2d, v23.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: ushll v20.2d, v20.2s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ucvtf v21.2d, v21.2d ; CHECK-NEXT: ucvtf v22.2d, v22.2d -; CHECK-NEXT: ucvtf v23.2d, v23.2d ; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ucvtf v23.2d, v23.2d +; CHECK-NEXT: ucvtf v4.2d, v4.2d +; CHECK-NEXT: ucvtf v20.2d, v20.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fadd v4.2d, v18.2d, v4.2d -; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d -; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d +; CHECK-NEXT: fadd v16.2d, v18.2d, v16.2d +; CHECK-NEXT: fadd v7.2d, v17.2d, v7.2d ; CHECK-NEXT: fadd v5.2d, v19.2d, v5.2d -; CHECK-NEXT: fadd v6.2d, v16.2d, v22.2d -; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d -; CHECK-NEXT: fadd v7.2d, v17.2d, v1.2d -; CHECK-NEXT: fadd v1.2d, v21.2d, v0.2d -; CHECK-NEXT: fadd v0.2d, v3.2d, v6.2d -; CHECK-NEXT: fadd v3.2d, v4.2d, v16.2d -; CHECK-NEXT: fadd v1.2d, v5.2d, v1.2d -; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d +; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d +; CHECK-NEXT: fadd v0.2d, v21.2d, v0.2d +; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d +; CHECK-NEXT: fadd v4.2d, v4.2d, v1.2d +; CHECK-NEXT: fadd v1.2d, v23.2d, v20.2d +; CHECK-NEXT: fadd v0.2d, v3.2d, v0.2d +; CHECK-NEXT: fadd v2.2d, v7.2d, v2.2d +; CHECK-NEXT: fadd v1.2d, v16.2d, v1.2d +; CHECK-NEXT: fadd v3.2d, v5.2d, v4.2d ; CHECK-NEXT: ret %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> %z1 = uitofp <8 x i16> %s1 to <8 x double> From a7697c86559e9d57c9c0e2b5f2daaa5cec4e5119 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 09:26:53 +0200 Subject: [PATCH 191/425] [ARM] Do not assume alignment in vld1xN and vst1xN intrinsics (#106984) These intrinsics currently assume natural alignment. Instead, respect the alignment attribute on the intrinsic. Teach InstCombine to improve that alignment. If desired I could also adjust the clang frontend to add alignment annotations equivalent to the previous behavior, but I don't see any indication that such an assumption is correct in the ARM intrinsics docs. Fixes https://github.com/llvm/llvm-project/issues/59081. 
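
For reference, a minimal IR sketch of the new behaviour (the struct body and
function names here are placeholders for the example; the updated arm-vld1.ll
tests below exercise the same pattern). Without a param alignment the intrinsic
is now treated as align 1 and no alignment specifier is emitted, while an
explicit `align 16` pointer argument still produces the :128 addressing form:

  %struct.uint16x4x2_t = type { [2 x <4 x i16>] }   ; assumed layout for the sketch

  declare %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr)

  define %struct.uint16x4x2_t @ld_noalign(ptr %a) {
    ; no param alignment -> align 1, emits: vld1.16 {d16, d17}, [r0]
    %t = call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr %a)
    ret %struct.uint16x4x2_t %t
  }

  define %struct.uint16x4x2_t @ld_align16(ptr %a) {
    ; align 16 on the pointer argument -> emits: vld1.16 {d16, d17}, [r0:128]
    %t = call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 16 %a)
    ret %struct.uint16x4x2_t %t
  }

Since InstCombine now upgrades the param alignment whenever the pointer's
alignment is provable, code with known-aligned pointers can still get the
:64/:128 addressing forms.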
--- llvm/lib/Target/ARM/ARMISelLowering.cpp | 4 +- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 16 ++ llvm/test/CodeGen/ARM/arm-vld1.ll | 182 ++++++++++------- llvm/test/CodeGen/ARM/arm-vst1.ll | 193 ++++++++++-------- .../test/CodeGen/ARM/bf16-intrinsics-ld-st.ll | 32 +-- .../InstCombine/ARM/neon-intrinsics.ll | 6 +- 6 files changed, 256 insertions(+), 177 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9096617a94855..aa663556deb76 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21073,7 +21073,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.arg_size() - 1); Info.offset = 0; - Info.align.reset(); + Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -21120,7 +21120,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align.reset(); + Info.align = I.getParamAlign(0).valueOrOne(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 912569a8fec11..9b5349241c341 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -163,6 +163,22 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } + case Intrinsic::arm_neon_vld1x2: + case Intrinsic::arm_neon_vld1x3: + case Intrinsic::arm_neon_vld1x4: + case Intrinsic::arm_neon_vst1x2: + case Intrinsic::arm_neon_vst1x3: + case Intrinsic::arm_neon_vst1x4: { + Align NewAlign = + getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()); + Align OldAlign = II.getParamAlign(0).valueOrOne(); + if (NewAlign > OldAlign) + II.addParamAttr(0, + Attribute::getWithAlignment(II.getContext(), NewAlign)); + break; + } + case Intrinsic::arm_mve_pred_i2v: { Value *Arg = II.getArgOperand(0); Value *ArgArg; diff --git a/llvm/test/CodeGen/ARM/arm-vld1.ll b/llvm/test/CodeGen/ARM/arm-vld1.ll index 78b0b92013c39..ec2793589759e 100644 --- a/llvm/test/CodeGen/ARM/arm-vld1.ll +++ b/llvm/test/CodeGen/ARM/arm-vld1.ll @@ -68,7 +68,7 @@ declare %struct.uint8x16x4_t @llvm.arm.neon.vld1x4.v16i8.p0(ptr) nounwind readon define %struct.uint16x4x2_t @test_vld1_u16_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u16_x2: -; CHECK: vld1.16 {d16, d17}, [r0:64] +; CHECK: vld1.16 {d16, d17}, [r0] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: bx lr @@ -76,9 +76,39 @@ define %struct.uint16x4x2_t @test_vld1_u16_x2(ptr %a) nounwind { ret %struct.uint16x4x2_t %tmp } +define %struct.uint16x4x2_t @test_vld1_u16_x2_align8(ptr %a) nounwind { +; CHECK-LABEL: test_vld1_u16_x2_align8: +; CHECK: vld1.16 {d16, d17}, [r0:64] +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr + %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 8 %a) + ret %struct.uint16x4x2_t %tmp +} + +define %struct.uint16x4x2_t @test_vld1_u16_x2_align16(ptr %a) nounwind { +; CHECK-LABEL: test_vld1_u16_x2_align16: +; 
CHECK: vld1.16 {d16, d17}, [r0:128] +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr + %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 16 %a) + ret %struct.uint16x4x2_t %tmp +} + +define %struct.uint16x4x2_t @test_vld1_u16_x2_align32(ptr %a) nounwind { +; CHECK-LABEL: test_vld1_u16_x2_align32: +; CHECK: vld1.16 {d16, d17}, [r0:128] +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr + %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 32 %a) + ret %struct.uint16x4x2_t %tmp +} + define %struct.uint16x4x3_t @test_vld1_u16_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u16_x3: -; CHECK: vld1.16 {d16, d17, d18}, [r1:64] +; CHECK: vld1.16 {d16, d17, d18}, [r1] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! ; CHECK-NEXT: vstr d18, [r0] @@ -89,7 +119,7 @@ define %struct.uint16x4x3_t @test_vld1_u16_x3(ptr %a) nounwind { define %struct.uint16x4x4_t @test_vld1_u16_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u16_x4: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! ; CHECK-NEXT: vst1.16 {d18}, [r0:64]! @@ -101,7 +131,7 @@ define %struct.uint16x4x4_t @test_vld1_u16_x4(ptr %a) nounwind { define %struct.uint32x2x2_t @test_vld1_u32_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u32_x2: -; CHECK: vld1.32 {d16, d17}, [r0:64] +; CHECK: vld1.32 {d16, d17}, [r0] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: bx lr @@ -111,7 +141,7 @@ define %struct.uint32x2x2_t @test_vld1_u32_x2(ptr %a) nounwind { define %struct.uint32x2x3_t @test_vld1_u32_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u32_x3: -; CHECK: vld1.32 {d16, d17, d18}, [r1:64] +; CHECK: vld1.32 {d16, d17, d18}, [r1] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! ; CHECK-NEXT: vstr d18, [r0] @@ -122,7 +152,7 @@ define %struct.uint32x2x3_t @test_vld1_u32_x3(ptr %a) nounwind { define %struct.uint32x2x4_t @test_vld1_u32_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u32_x4: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! ; CHECK-NEXT: vst1.32 {d18}, [r0:64]! @@ -134,7 +164,7 @@ define %struct.uint32x2x4_t @test_vld1_u32_x4(ptr %a) nounwind { define %struct.uint64x1x2_t @test_vld1_u64_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u64_x2: -; CHECK: vld1.64 {d16, d17}, [r0:64] +; CHECK: vld1.64 {d16, d17}, [r0] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: bx lr @@ -144,7 +174,7 @@ define %struct.uint64x1x2_t @test_vld1_u64_x2(ptr %a) nounwind { define %struct.uint64x1x3_t @test_vld1_u64_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u64_x3: -; CHECK: vld1.64 {d16, d17, d18}, [r1:64] +; CHECK: vld1.64 {d16, d17, d18}, [r1] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! ; CHECK-NEXT: vstr d18, [r0] @@ -155,7 +185,7 @@ define %struct.uint64x1x3_t @test_vld1_u64_x3(ptr %a) nounwind { define %struct.uint64x1x4_t @test_vld1_u64_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u64_x4: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! ; CHECK-NEXT: vst1.64 {d18}, [r0:64]! 
@@ -167,7 +197,7 @@ define %struct.uint64x1x4_t @test_vld1_u64_x4(ptr %a) nounwind { define %struct.uint8x8x2_t @test_vld1_u8_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u8_x2: -; CHECK: vld1.8 {d16, d17}, [r0:64] +; CHECK: vld1.8 {d16, d17}, [r0] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: bx lr @@ -177,7 +207,7 @@ define %struct.uint8x8x2_t @test_vld1_u8_x2(ptr %a) nounwind { define %struct.uint8x8x3_t @test_vld1_u8_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u8_x3: -; CHECK: vld1.8 {d16, d17, d18}, [r1:64] +; CHECK: vld1.8 {d16, d17, d18}, [r1] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! ; CHECK-NEXT: vstr d18, [r0] @@ -188,7 +218,7 @@ define %struct.uint8x8x3_t @test_vld1_u8_x3(ptr %a) nounwind { define %struct.uint8x8x4_t @test_vld1_u8_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u8_x4: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! ; CHECK-NEXT: vst1.8 {d18}, [r0:64]! @@ -200,7 +230,7 @@ define %struct.uint8x8x4_t @test_vld1_u8_x4(ptr %a) nounwind { define %struct.uint16x8x2_t @test_vld1q_u16_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u16_x2: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] ; CHECK-NEXT: bx lr @@ -210,8 +240,8 @@ define %struct.uint16x8x2_t @test_vld1q_u16_x2(ptr %a) nounwind { define %struct.uint16x8x3_t @test_vld1q_u16_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u16_x3: -; CHECK: vld1.16 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.16 {d19, d20, d21}, [r1:64] +; CHECK: vld1.16 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.16 {d19, d20, d21}, [r1] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.16 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0] @@ -222,8 +252,8 @@ define %struct.uint16x8x3_t @test_vld1q_u16_x3(ptr %a) nounwind { define %struct.uint16x8x4_t @test_vld1q_u16_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u16_x4: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.16 {d20, d21, d22, d23}, [r1:256] +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.16 {d20, d21, d22, d23}, [r1] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.16 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.16 {d20, d21}, [r0]! @@ -235,7 +265,7 @@ define %struct.uint16x8x4_t @test_vld1q_u16_x4(ptr %a) nounwind { define %struct.uint32x4x2_t @test_vld1q_u32_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u32_x2: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] ; CHECK-NEXT: bx lr @@ -245,8 +275,8 @@ define %struct.uint32x4x2_t @test_vld1q_u32_x2(ptr %a) nounwind { define %struct.uint32x4x3_t @test_vld1q_u32_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u32_x3: -; CHECK: vld1.32 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.32 {d19, d20, d21}, [r1:64] +; CHECK: vld1.32 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.32 {d19, d20, d21}, [r1] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.32 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0] @@ -257,8 +287,8 @@ define %struct.uint32x4x3_t @test_vld1q_u32_x3(ptr %a) nounwind { define %struct.uint32x4x4_t @test_vld1q_u32_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u32_x4: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256]! 
-; CHECK-NEXT: vld1.32 {d20, d21, d22, d23}, [r1:256] +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.32 {d20, d21, d22, d23}, [r1] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.32 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.32 {d20, d21}, [r0]! @@ -270,7 +300,7 @@ define %struct.uint32x4x4_t @test_vld1q_u32_x4(ptr %a) nounwind { define %struct.uint64x2x2_t @test_vld1q_u64_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u64_x2: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] ; CHECK-NEXT: bx lr @@ -280,8 +310,8 @@ define %struct.uint64x2x2_t @test_vld1q_u64_x2(ptr %a) nounwind { define %struct.uint64x2x3_t @test_vld1q_u64_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u64_x3: -; CHECK: vld1.64 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.64 {d19, d20, d21}, [r1:64] +; CHECK: vld1.64 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.64 {d19, d20, d21}, [r1] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0] @@ -292,8 +322,8 @@ define %struct.uint64x2x3_t @test_vld1q_u64_x3(ptr %a) nounwind { define %struct.uint64x2x4_t @test_vld1q_u64_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u64_x4: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.64 {d20, d21, d22, d23}, [r1:256] +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.64 {d20, d21, d22, d23}, [r1] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0]! @@ -305,7 +335,7 @@ define %struct.uint64x2x4_t @test_vld1q_u64_x4(ptr %a) nounwind { define %struct.uint8x16x2_t @test_vld1q_u8_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u8_x2: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] ; CHECK-NEXT: bx lr @@ -315,8 +345,8 @@ define %struct.uint8x16x2_t @test_vld1q_u8_x2(ptr %a) nounwind { define %struct.uint8x16x3_t @test_vld1q_u8_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u8_x3: -; CHECK: vld1.8 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.8 {d19, d20, d21}, [r1:64] +; CHECK: vld1.8 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.8 {d19, d20, d21}, [r1] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.8 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0] @@ -327,8 +357,8 @@ define %struct.uint8x16x3_t @test_vld1q_u8_x3(ptr %a) nounwind { define %struct.uint8x16x4_t @test_vld1q_u8_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u8_x4: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.8 {d20, d21, d22, d23}, [r1:256] +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.8 {d20, d21, d22, d23}, [r1] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.8 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.8 {d20, d21}, [r0]! @@ -344,7 +374,7 @@ define %struct.uint16x4x2_t @test_vld1_u16_x2_post_imm(ptr %a, ptr %ptr) nounwin ; CHECK-LABEL: test_vld1_u16_x2_post_imm: ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: vld1.16 {d16, d17}, [r0:64]! +; CHECK-NEXT: vld1.16 {d16, d17}, [r0]! 
; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -362,7 +392,7 @@ define %struct.uint16x4x2_t @test_vld1_u16_x2_post_reg(ptr %a, ptr %ptr, i32 %in ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: lsl r2, r2, #1 -; CHECK-NEXT: vld1.16 {d16, d17}, [r0:64], r2 +; CHECK-NEXT: vld1.16 {d16, d17}, [r0], r2 ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -377,7 +407,7 @@ define %struct.uint16x4x2_t @test_vld1_u16_x2_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint16x4x3_t @test_vld1_u16_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u16_x3_post_imm: -; CHECK: vld1.16 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.16 {d16, d17, d18}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! @@ -392,7 +422,7 @@ define %struct.uint16x4x3_t @test_vld1_u16_x3_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint16x4x3_t @test_vld1_u16_x3_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u16_x3_post_reg: ; CHECK: lsl r3, r3, #1 -; CHECK-NEXT: vld1.16 {d16, d17, d18}, [r1:64], r3 +; CHECK-NEXT: vld1.16 {d16, d17, d18}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! @@ -406,7 +436,7 @@ define %struct.uint16x4x3_t @test_vld1_u16_x3_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint16x4x4_t @test_vld1_u16_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u16_x4_post_imm: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! @@ -422,7 +452,7 @@ define %struct.uint16x4x4_t @test_vld1_u16_x4_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint16x4x4_t @test_vld1_u16_x4_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u16_x4_post_reg: ; CHECK: lsl r3, r3, #1 -; CHECK-NEXT: vld1.16 {d16, d17, d18, d19}, [r1:256], r3 +; CHECK-NEXT: vld1.16 {d16, d17, d18, d19}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! @@ -439,7 +469,7 @@ define %struct.uint32x2x2_t @test_vld1_u32_x2_post_imm(ptr %a, ptr %ptr) nounwin ; CHECK-LABEL: test_vld1_u32_x2_post_imm: ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: vld1.32 {d16, d17}, [r0:64]! +; CHECK-NEXT: vld1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -457,7 +487,7 @@ define %struct.uint32x2x2_t @test_vld1_u32_x2_post_reg(ptr %a, ptr %ptr, i32 %in ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: lsl r2, r2, #2 -; CHECK-NEXT: vld1.32 {d16, d17}, [r0:64], r2 +; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r2 ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -472,7 +502,7 @@ define %struct.uint32x2x2_t @test_vld1_u32_x2_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint32x2x3_t @test_vld1_u32_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u32_x3_post_imm: -; CHECK: vld1.32 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.32 {d16, d17, d18}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! 
@@ -487,7 +517,7 @@ define %struct.uint32x2x3_t @test_vld1_u32_x3_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint32x2x3_t @test_vld1_u32_x3_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u32_x3_post_reg: ; CHECK: lsl r3, r3, #2 -; CHECK-NEXT: vld1.32 {d16, d17, d18}, [r1:64], r3 +; CHECK-NEXT: vld1.32 {d16, d17, d18}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! @@ -501,7 +531,7 @@ define %struct.uint32x2x3_t @test_vld1_u32_x3_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint32x2x4_t @test_vld1_u32_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u32_x4_post_imm: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! @@ -517,7 +547,7 @@ define %struct.uint32x2x4_t @test_vld1_u32_x4_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint32x2x4_t @test_vld1_u32_x4_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u32_x4_post_reg: ; CHECK: lsl r3, r3, #2 -; CHECK-NEXT: vld1.32 {d16, d17, d18, d19}, [r1:256], r3 +; CHECK-NEXT: vld1.32 {d16, d17, d18, d19}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! @@ -534,7 +564,7 @@ define %struct.uint64x1x2_t @test_vld1_u64_x2_post_imm(ptr %a, ptr %ptr) nounwin ; CHECK-LABEL: test_vld1_u64_x2_post_imm: ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: vld1.64 {d16, d17}, [r0:64]! +; CHECK-NEXT: vld1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -552,7 +582,7 @@ define %struct.uint64x1x2_t @test_vld1_u64_x2_post_reg(ptr %a, ptr %ptr, i32 %in ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: lsl r2, r2, #3 -; CHECK-NEXT: vld1.64 {d16, d17}, [r0:64], r2 +; CHECK-NEXT: vld1.64 {d16, d17}, [r0], r2 ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -567,7 +597,7 @@ define %struct.uint64x1x2_t @test_vld1_u64_x2_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint64x1x3_t @test_vld1_u64_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u64_x3_post_imm: -; CHECK: vld1.64 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.64 {d16, d17, d18}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! @@ -582,7 +612,7 @@ define %struct.uint64x1x3_t @test_vld1_u64_x3_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint64x1x3_t @test_vld1_u64_x3_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u64_x3_post_reg: ; CHECK: lsl r3, r3, #3 -; CHECK-NEXT: vld1.64 {d16, d17, d18}, [r1:64], r3 +; CHECK-NEXT: vld1.64 {d16, d17, d18}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! @@ -596,7 +626,7 @@ define %struct.uint64x1x3_t @test_vld1_u64_x3_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint64x1x4_t @test_vld1_u64_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u64_x4_post_imm: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! 
@@ -612,7 +642,7 @@ define %struct.uint64x1x4_t @test_vld1_u64_x4_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint64x1x4_t @test_vld1_u64_x4_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u64_x4_post_reg: ; CHECK: lsl r3, r3, #3 -; CHECK-NEXT: vld1.64 {d16, d17, d18, d19}, [r1:256], r3 +; CHECK-NEXT: vld1.64 {d16, d17, d18, d19}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! @@ -629,7 +659,7 @@ define %struct.uint8x8x2_t @test_vld1_u8_x2_post_imm(ptr %a, ptr %ptr) nounwind ; CHECK-LABEL: test_vld1_u8_x2_post_imm: ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: vld1.8 {d16, d17}, [r0:64]! +; CHECK-NEXT: vld1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -646,7 +676,7 @@ define %struct.uint8x8x2_t @test_vld1_u8_x2_post_reg(ptr %a, ptr %ptr, i32 %inc) ; CHECK-LABEL: test_vld1_u8_x2_post_reg: ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: vld1.8 {d16, d17}, [r0:64], r2 +; CHECK-NEXT: vld1.8 {d16, d17}, [r0], r2 ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -661,7 +691,7 @@ define %struct.uint8x8x2_t @test_vld1_u8_x2_post_reg(ptr %a, ptr %ptr, i32 %inc) define %struct.uint8x8x3_t @test_vld1_u8_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u8_x3_post_imm: -; CHECK: vld1.8 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.8 {d16, d17, d18}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! @@ -675,7 +705,7 @@ define %struct.uint8x8x3_t @test_vld1_u8_x3_post_imm(ptr %a, ptr %ptr) nounwind define %struct.uint8x8x3_t @test_vld1_u8_x3_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u8_x3_post_reg: -; CHECK: vld1.8 {d16, d17, d18}, [r1:64], r3 +; CHECK: vld1.8 {d16, d17, d18}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! @@ -689,7 +719,7 @@ define %struct.uint8x8x3_t @test_vld1_u8_x3_post_reg(ptr %a, ptr %ptr, i32 %inc) define %struct.uint8x8x4_t @test_vld1_u8_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u8_x4_post_imm: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! @@ -704,7 +734,7 @@ define %struct.uint8x8x4_t @test_vld1_u8_x4_post_imm(ptr %a, ptr %ptr) nounwind define %struct.uint8x8x4_t @test_vld1_u8_x4_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u8_x4_post_reg: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256], r3 +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! @@ -719,7 +749,7 @@ define %struct.uint8x8x4_t @test_vld1_u8_x4_post_reg(ptr %a, ptr %ptr, i32 %inc) define %struct.uint16x8x2_t @test_vld1q_u16_x2_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u16_x2_post_imm: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! 
; CHECK-NEXT: vst1.64 {d18, d19}, [r0] @@ -732,8 +762,8 @@ define %struct.uint16x8x2_t @test_vld1q_u16_x2_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint16x8x3_t @test_vld1q_u16_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u16_x3_post_imm: -; CHECK: vld1.16 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.16 {d19, d20, d21}, [r1:64]! +; CHECK: vld1.16 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.16 {d19, d20, d21}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.16 {d18, d19}, [r0]! @@ -747,8 +777,8 @@ define %struct.uint16x8x3_t @test_vld1q_u16_x3_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint16x8x4_t @test_vld1q_u16_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u16_x4_post_imm: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.16 {d20, d21, d22, d23}, [r1:256]! +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.16 {d20, d21, d22, d23}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.16 {d18, d19}, [r0]! @@ -763,7 +793,7 @@ define %struct.uint16x8x4_t @test_vld1q_u16_x4_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint32x4x2_t @test_vld1q_u32_x2_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u32_x2_post_imm: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] @@ -776,8 +806,8 @@ define %struct.uint32x4x2_t @test_vld1q_u32_x2_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint32x4x3_t @test_vld1q_u32_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u32_x3_post_imm: -; CHECK: vld1.32 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.32 {d19, d20, d21}, [r1:64]! +; CHECK: vld1.32 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.32 {d19, d20, d21}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.32 {d18, d19}, [r0]! @@ -791,8 +821,8 @@ define %struct.uint32x4x3_t @test_vld1q_u32_x3_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint32x4x4_t @test_vld1q_u32_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u32_x4_post_imm: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.32 {d20, d21, d22, d23}, [r1:256]! +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.32 {d20, d21, d22, d23}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.32 {d18, d19}, [r0]! @@ -807,7 +837,7 @@ define %struct.uint32x4x4_t @test_vld1q_u32_x4_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint64x2x2_t @test_vld1q_u64_x2_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u64_x2_post_imm: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] @@ -820,8 +850,8 @@ define %struct.uint64x2x2_t @test_vld1q_u64_x2_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint64x2x3_t @test_vld1q_u64_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u64_x3_post_imm: -; CHECK: vld1.64 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.64 {d19, d20, d21}, [r1:64]! +; CHECK: vld1.64 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.64 {d19, d20, d21}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0]! 
@@ -835,8 +865,8 @@ define %struct.uint64x2x3_t @test_vld1q_u64_x3_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint64x2x4_t @test_vld1q_u64_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u64_x4_post_imm: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.64 {d20, d21, d22, d23}, [r1:256]! +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.64 {d20, d21, d22, d23}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0]! @@ -851,7 +881,7 @@ define %struct.uint64x2x4_t @test_vld1q_u64_x4_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint8x16x2_t @test_vld1q_u8_x2_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u8_x2_post_imm: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] @@ -864,8 +894,8 @@ define %struct.uint8x16x2_t @test_vld1q_u8_x2_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint8x16x3_t @test_vld1q_u8_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u8_x3_post_imm: -; CHECK: vld1.8 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.8 {d19, d20, d21}, [r1:64]! +; CHECK: vld1.8 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.8 {d19, d20, d21}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.8 {d18, d19}, [r0]! @@ -879,8 +909,8 @@ define %struct.uint8x16x3_t @test_vld1q_u8_x3_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint8x16x4_t @test_vld1q_u8_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u8_x4_post_imm: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.8 {d20, d21, d22, d23}, [r1:256]! +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.8 {d20, d21, d22, d23}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.8 {d18, d19}, [r0]! 
diff --git a/llvm/test/CodeGen/ARM/arm-vst1.ll b/llvm/test/CodeGen/ARM/arm-vst1.ll index 7dacd8b0b99f9..5d0a7e9614ce9 100644 --- a/llvm/test/CodeGen/ARM/arm-vst1.ll +++ b/llvm/test/CodeGen/ARM/arm-vst1.ll @@ -92,7 +92,7 @@ declare void @llvm.arm.neon.vst1x4.p0.v16i8(ptr nocapture, <16 x i8>, <16 x i8>, define arm_aapcs_vfpcc void @test_vst1_u16_x2(ptr %a, %struct.uint16x4x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x2: -; CHECK: vst1.16 {d0, d1}, [r0:64] +; CHECK: vst1.16 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 @@ -101,9 +101,42 @@ entry: ret void } +define arm_aapcs_vfpcc void @test_vst1_u16_x2_align8(ptr %a, %struct.uint16x4x2_t %b) nounwind { +; CHECK-LABEL: test_vst1_u16_x2_align8: +; CHECK: vst1.16 {d0, d1}, [r0:64] +; CHECK-NEXT: bx lr +entry: + %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr align 8 %a, <4 x i16> %b0, <4 x i16> %b1) + ret void +} + +define arm_aapcs_vfpcc void @test_vst1_u16_x2_align16(ptr %a, %struct.uint16x4x2_t %b) nounwind { +; CHECK-LABEL: test_vst1_u16_x2_align16: +; CHECK: vst1.16 {d0, d1}, [r0:128] +; CHECK-NEXT: bx lr +entry: + %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr align 16 %a, <4 x i16> %b0, <4 x i16> %b1) + ret void +} + +define arm_aapcs_vfpcc void @test_vst1_u16_x2_align32(ptr %a, %struct.uint16x4x2_t %b) nounwind { +; CHECK-LABEL: test_vst1_u16_x2_align32: +; CHECK: vst1.16 {d0, d1}, [r0:128] +; CHECK-NEXT: bx lr +entry: + %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr align 32 %a, <4 x i16> %b0, <4 x i16> %b1) + ret void +} + define arm_aapcs_vfpcc void @test_vst1_u16_x3(ptr %a, %struct.uint16x4x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x3: -; CHECK: vst1.16 {d0, d1, d2}, [r0:64] +; CHECK: vst1.16 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x4x3_t %b, 0, 0 @@ -115,7 +148,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u16_x4(ptr %a, %struct.uint16x4x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x4: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x4x4_t %b, 0, 0 @@ -128,7 +161,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u32_x2(ptr %a, %struct.uint32x2x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x2: -; CHECK: vst1.32 {d0, d1}, [r0:64] +; CHECK: vst1.32 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x2_t %b, 0, 0 @@ -139,7 +172,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u32_x3(ptr %a, %struct.uint32x2x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x3: -; CHECK: vst1.32 {d0, d1, d2}, [r0:64] +; CHECK: vst1.32 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x3_t %b, 0, 0 @@ -151,7 +184,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u32_x4(ptr %a, %struct.uint32x2x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x4: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x4_t %b, 0, 0 @@ -164,7 +197,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u64_x2(ptr %a, %struct.uint64x1x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x2: -; CHECK: vst1.64 {d0, d1}, [r0:64] +; 
CHECK: vst1.64 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x2_t %b, 0, 0 @@ -175,7 +208,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u64_x3(ptr %a, %struct.uint64x1x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x3: -; CHECK: vst1.64 {d0, d1, d2}, [r0:64] +; CHECK: vst1.64 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x3_t %b, 0, 0 @@ -187,7 +220,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u64_x4(ptr %a, %struct.uint64x1x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x4: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x4_t %b, 0, 0 @@ -200,7 +233,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u8_x2(ptr %a, %struct.uint8x8x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x2: -; CHECK: vst1.8 {d0, d1}, [r0:64] +; CHECK: vst1.8 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x2_t %b, 0, 0 @@ -211,7 +244,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u8_x3(ptr %a, %struct.uint8x8x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x3: -; CHECK: vst1.8 {d0, d1, d2}, [r0:64] +; CHECK: vst1.8 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x3_t %b, 0, 0 @@ -223,7 +256,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u8_x4(ptr %a, %struct.uint8x8x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x4: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x4_t %b, 0, 0 @@ -236,7 +269,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u16_x2(ptr %a, %struct.uint16x8x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x2: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x2_t %b, 0, 0 @@ -247,8 +280,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u16_x3(ptr %a, %struct.uint16x8x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x3: -; CHECK: vst1.16 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0:64] +; CHECK: vst1.16 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x3_t %b, 0, 0 @@ -260,8 +293,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u16_x4(ptr %a, %struct.uint16x8x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x4: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0:256] +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x4_t %b, 0, 0 @@ -274,7 +307,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u32_x2(ptr %a, %struct.uint32x4x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x2: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x2_t %b, 0, 0 @@ -285,8 +318,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u32_x3(ptr %a, %struct.uint32x4x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x3: -; CHECK: vst1.32 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.32 {d3, d4, d5}, [r0:64] +; CHECK: vst1.32 {d0, d1, d2}, [r0]! 
+; CHECK-NEXT: vst1.32 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x3_t %b, 0, 0 @@ -298,8 +331,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u32_x4(ptr %a, %struct.uint32x4x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x4: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.32 {d4, d5, d6, d7}, [r0:256] +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.32 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x4_t %b, 0, 0 @@ -312,7 +345,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u64_x2(ptr %a, %struct.uint64x2x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x2: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x2_t %b, 0, 0 @@ -323,8 +356,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u64_x3(ptr %a, %struct.uint64x2x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x3: -; CHECK: vst1.64 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.64 {d3, d4, d5}, [r0:64] +; CHECK: vst1.64 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.64 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x3_t %b, 0, 0 @@ -336,8 +369,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u64_x4(ptr %a, %struct.uint64x2x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x4: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.64 {d4, d5, d6, d7}, [r0:256] +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.64 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x4_t %b, 0, 0 @@ -350,7 +383,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u8_x2(ptr %a, %struct.uint8x16x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x2: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x2_t %b, 0, 0 @@ -361,8 +394,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u8_x3(ptr %a, %struct.uint8x16x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x3: -; CHECK: vst1.8 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.8 {d3, d4, d5}, [r0:64] +; CHECK: vst1.8 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.8 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x3_t %b, 0, 0 @@ -374,8 +407,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u8_x4(ptr %a, %struct.uint8x16x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x4: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.8 {d4, d5, d6, d7}, [r0:256] +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.8 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x4_t %b, 0, 0 @@ -390,7 +423,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x2_post_imm(ptr %a, %struct.uint8x8x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x2_post_imm: -; CHECK: vst1.8 {d0, d1}, [r0:64]! +; CHECK: vst1.8 {d0, d1}, [r0]! 
; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x2_t %b, 0, 0 @@ -402,7 +435,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x2_post_reg(ptr %a, %struct.uint8x8x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u8_x2_post_reg: -; CHECK: vst1.8 {d0, d1}, [r0:64], r1 +; CHECK: vst1.8 {d0, d1}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x2_t %b, 0, 0 @@ -414,7 +447,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u16_x2_post_imm(ptr %a, %struct.uint16x4x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x2_post_imm: -; CHECK: vst1.16 {d0, d1}, [r0:64]! +; CHECK: vst1.16 {d0, d1}, [r0]! ; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 @@ -426,7 +459,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x2_post_imm(ptr %a, %struct.uint16x4x2 define arm_aapcs_vfpcc ptr @test_vst1_u16_x2_post_reg(ptr %a, %struct.uint16x4x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u16_x2_post_reg: ; CHECK: lsl r1, r1, #1 -; CHECK-NEXT: vst1.16 {d0, d1}, [r0:64], r1 +; CHECK-NEXT: vst1.16 {d0, d1}, [r0], r1 ; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 @@ -437,7 +470,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x2_post_reg(ptr %a, %struct.uint16x4x2 define arm_aapcs_vfpcc ptr @test_vst1_u32_x2_post_imm(ptr %a, %struct.uint32x2x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x2_post_imm: -; CHECK: vst1.32 {d0, d1}, [r0:64]! +; CHECK: vst1.32 {d0, d1}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x2_t %b, 0, 0 @@ -450,7 +483,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u32_x2_post_reg(ptr %a, %struct.uint32x2x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u32_x2_post_reg: ; CHECK: lsl r1, r1, #2 -; CHECK-NEXT: vst1.32 {d0, d1}, [r0:64], r1 +; CHECK-NEXT: vst1.32 {d0, d1}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x2_t %b, 0, 0 @@ -462,7 +495,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x2_post_imm(ptr %a, %struct.uint64x1x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x2_post_imm: -; CHECK: vst1.64 {d0, d1}, [r0:64]! +; CHECK: vst1.64 {d0, d1}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x2_t %b, 0, 0 @@ -475,7 +508,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x2_post_reg(ptr %a, %struct.uint64x1x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u64_x2_post_reg: ; CHECK: lsl r1, r1, #3 -; CHECK-NEXT: vst1.64 {d0, d1}, [r0:64], r1 +; CHECK-NEXT: vst1.64 {d0, d1}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x2_t %b, 0, 0 @@ -487,7 +520,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u8_x2_post_imm(ptr %a, %struct.uint8x16x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x2_post_imm: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x2_t %b, 0, 0 @@ -499,7 +532,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u8_x2_post_reg(ptr %a, %struct.uint8x16x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1q_u8_x2_post_reg: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x2_t %b, 0, 0 @@ -511,7 +544,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u16_x2_post_imm(ptr %a, %struct.uint16x8x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x2_post_imm: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256]! 
+; CHECK: vst1.16 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x2_t %b, 0, 0 @@ -524,7 +557,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u16_x2_post_reg(ptr %a, %struct.uint16x8x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1q_u16_x2_post_reg: ; CHECK: lsl r1, r1, #1 -; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x2_t %b, 0, 0 @@ -536,7 +569,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u32_x2_post_imm(ptr %a, %struct.uint32x4x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x2_post_imm: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x2_t %b, 0, 0 @@ -549,7 +582,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u32_x2_post_reg(ptr %a, %struct.uint32x4x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1q_u32_x2_post_reg: ; CHECK: lsl r1, r1, #2 -; CHECK-NEXT: vst1.32 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.32 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x2_t %b, 0, 0 @@ -561,7 +594,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u64_x2_post_imm(ptr %a, %struct.uint64x2x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x2_post_imm: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x2_t %b, 0, 0 @@ -574,7 +607,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u64_x2_post_reg(ptr %a, %struct.uint64x2x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1q_u64_x2_post_reg: ; CHECK: lsl r1, r1, #3 -; CHECK-NEXT: vst1.64 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.64 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x2_t %b, 0, 0 @@ -587,7 +620,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x3_post_imm(ptr %a, %struct.uint8x8x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x3_post_imm: -; CHECK: vst1.8 {d0, d1, d2}, [r0:64]! +; CHECK: vst1.8 {d0, d1, d2}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x3_t %b, 0, 0 @@ -600,7 +633,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x3_post_reg(ptr %a, %struct.uint8x8x3_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u8_x3_post_reg: -; CHECK: vst1.8 {d0, d1, d2}, [r0:64], r1 +; CHECK: vst1.8 {d0, d1, d2}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x3_t %b, 0, 0 @@ -613,7 +646,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u16_x3_post_imm(ptr %a, %struct.uint16x4x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x3_post_imm: -; CHECK: vst1.16 {d0, d1, d2}, [r0:64]! +; CHECK: vst1.16 {d0, d1, d2}, [r0]! 
; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x3_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x3_t %b, 0, 1 @@ -626,7 +659,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x3_post_imm(ptr %a, %struct.uint16x4x3 define arm_aapcs_vfpcc ptr @test_vst1_u16_x3_post_reg(ptr %a, %struct.uint16x4x3_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u16_x3_post_reg: ; CHECK: lsl r1, r1, #1 -; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64], r1 +; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0], r1 ; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x3_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x3_t %b, 0, 1 @@ -638,7 +671,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x3_post_reg(ptr %a, %struct.uint16x4x3 define arm_aapcs_vfpcc ptr @test_vst1_u32_x3_post_imm(ptr %a, %struct.uint32x2x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x3_post_imm: -; CHECK: vst1.32 {d0, d1, d2}, [r0:64]! +; CHECK: vst1.32 {d0, d1, d2}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x3_t %b, 0, 0 @@ -652,7 +685,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u32_x3_post_reg(ptr %a, %struct.uint32x2x3_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u32_x3_post_reg: ; CHECK: lsl r1, r1, #2 -; CHECK-NEXT: vst1.32 {d0, d1, d2}, [r0:64], r1 +; CHECK-NEXT: vst1.32 {d0, d1, d2}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x3_t %b, 0, 0 @@ -665,7 +698,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x3_post_imm(ptr %a, %struct.uint64x1x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x3_post_imm: -; CHECK: vst1.64 {d0, d1, d2}, [r0:64]! +; CHECK: vst1.64 {d0, d1, d2}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x3_t %b, 0, 0 @@ -679,7 +712,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x3_post_reg(ptr %a, %struct.uint64x1x3_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u64_x3_post_reg: ; CHECK: lsl r1, r1, #3 -; CHECK-NEXT: vst1.64 {d0, d1, d2}, [r0:64], r1 +; CHECK-NEXT: vst1.64 {d0, d1, d2}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x3_t %b, 0, 0 @@ -692,8 +725,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u8_x3_post_imm(ptr %a, %struct.uint8x16x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x3_post_imm: -; CHECK: vst1.8 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.8 {d3, d4, d5}, [r0:64]! +; CHECK: vst1.8 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.8 {d3, d4, d5}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x3_t %b, 0, 0 @@ -706,8 +739,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u16_x3_post_imm(ptr %a, %struct.uint16x8x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x3_post_imm: -; CHECK: vst1.16 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0:64]! +; CHECK: vst1.16 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x3_t %b, 0, 0 @@ -720,8 +753,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u32_x3_post_imm(ptr %a, %struct.uint32x4x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x3_post_imm: -; CHECK: vst1.32 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.32 {d3, d4, d5}, [r0:64]! +; CHECK: vst1.32 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.32 {d3, d4, d5}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x3_t %b, 0, 0 @@ -734,8 +767,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u64_x3_post_imm(ptr %a, %struct.uint64x2x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x3_post_imm: -; CHECK: vst1.64 {d0, d1, d2}, [r0:64]! 
-; CHECK-NEXT: vst1.64 {d3, d4, d5}, [r0:64]! +; CHECK: vst1.64 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.64 {d3, d4, d5}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x3_t %b, 0, 0 @@ -748,7 +781,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x4_post_imm(ptr %a, %struct.uint8x8x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x4_post_imm: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x4_t %b, 0, 0 @@ -762,7 +795,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x4_post_reg(ptr %a, %struct.uint8x8x4_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u8_x4_post_reg: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x4_t %b, 0, 0 @@ -776,7 +809,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u16_x4_post_imm(ptr %a, %struct.uint16x4x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x4_post_imm: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x4_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x4_t %b, 0, 1 @@ -790,7 +823,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x4_post_imm(ptr %a, %struct.uint16x4x4 define arm_aapcs_vfpcc ptr @test_vst1_u16_x4_post_reg(ptr %a, %struct.uint16x4x4_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u16_x4_post_reg: ; CHECK: lsl r1, r1, #1 -; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x4_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x4_t %b, 0, 1 @@ -803,7 +836,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x4_post_reg(ptr %a, %struct.uint16x4x4 define arm_aapcs_vfpcc ptr @test_vst1_u32_x4_post_imm(ptr %a, %struct.uint32x2x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x4_post_imm: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x4_t %b, 0, 0 @@ -818,7 +851,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u32_x4_post_reg(ptr %a, %struct.uint32x2x4_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u32_x4_post_reg: ; CHECK: lsl r1, r1, #2 -; CHECK-NEXT: vst1.32 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.32 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x4_t %b, 0, 0 @@ -832,7 +865,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x4_post_imm(ptr %a, %struct.uint64x1x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x4_post_imm: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x4_t %b, 0, 0 @@ -847,7 +880,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x4_post_reg(ptr %a, %struct.uint64x1x4_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u64_x4_post_reg: ; CHECK: lsl r1, r1, #3 -; CHECK-NEXT: vst1.64 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.64 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x4_t %b, 0, 0 @@ -861,8 +894,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u8_x4_post_imm(ptr %a, %struct.uint8x16x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x4_post_imm: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.8 {d4, d5, d6, d7}, [r0:256]! +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0]! 
+; CHECK-NEXT: vst1.8 {d4, d5, d6, d7}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x4_t %b, 0, 0 @@ -876,8 +909,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u16_x4_post_imm(ptr %a, %struct.uint16x8x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x4_post_imm: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0:256]! +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x4_t %b, 0, 0 @@ -891,8 +924,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u32_x4_post_imm(ptr %a, %struct.uint32x4x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x4_post_imm: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.32 {d4, d5, d6, d7}, [r0:256]! +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.32 {d4, d5, d6, d7}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x4_t %b, 0, 0 @@ -906,8 +939,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u64_x4_post_imm(ptr %a, %struct.uint64x2x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x4_post_imm: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.64 {d4, d5, d6, d7}, [r0:256]! +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.64 {d4, d5, d6, d7}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x4_t %b, 0, 0 diff --git a/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll b/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll index e49128f53b115..846cf239e8987 100644 --- a/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll +++ b/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll @@ -60,7 +60,7 @@ entry: define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld1_bf16_x2(ptr %ptr) { ; CHECK-LABEL: test_vld1_bf16_x2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1}, [r0:64] +; CHECK-NEXT: vld1.16 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr %ptr) @@ -76,7 +76,7 @@ entry: define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld1q_bf16_x2(ptr %ptr) { ; CHECK-LABEL: test_vld1q_bf16_x2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr %ptr) @@ -92,7 +92,7 @@ entry: define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld1_bf16_x3(ptr %ptr) { ; CHECK-LABEL: test_vld1_bf16_x3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0:64] +; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr %ptr) @@ -111,8 +111,8 @@ entry: define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld1q_bf16_x3(ptr %ptr) { ; CHECK-LABEL: test_vld1q_bf16_x3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vld1.16 {d3, d4, d5}, [r0:64] +; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0]! 
+; CHECK-NEXT: vld1.16 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr %ptr) @@ -131,7 +131,7 @@ entry: define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld1_bf16_x4(ptr %ptr) { ; CHECK-LABEL: test_vld1_bf16_x4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr %ptr) @@ -153,8 +153,8 @@ entry: define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld1q_bf16_x4(ptr %ptr) { ; CHECK-LABEL: test_vld1q_bf16_x4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vld1.16 {d4, d5, d6, d7}, [r0:256] +; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vld1.16 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr %ptr) @@ -635,7 +635,7 @@ define arm_aapcs_vfpcc void @test_vst1_bf16_x2(ptr nocapture %ptr, [2 x <2 x i32 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 -; CHECK-NEXT: vst1.16 {d0, d1}, [r0:64] +; CHECK-NEXT: vst1.16 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0 @@ -651,7 +651,7 @@ define arm_aapcs_vfpcc void @test_vst1q_bf16_x2(ptr nocapture %ptr, [2 x <4 x i3 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0 @@ -668,7 +668,7 @@ define arm_aapcs_vfpcc void @test_vst1_bf16_x3(ptr nocapture %ptr, [3 x <2 x i32 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64] +; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0 @@ -687,8 +687,8 @@ define arm_aapcs_vfpcc void @test_vst1q_bf16_x3(ptr nocapture %ptr, [3 x <4 x i3 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0:64] +; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0]! 
+; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0 @@ -708,7 +708,7 @@ define arm_aapcs_vfpcc void @test_vst1_bf16_x4(ptr nocapture %ptr, [4 x <2 x i32 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0 @@ -730,8 +730,8 @@ define arm_aapcs_vfpcc void @test_vst1q_bf16_x4(ptr nocapture %ptr, [4 x <4 x i3 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0:256] +; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0 diff --git a/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll index 68dda24b53747..b0435f2a276f1 100644 --- a/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll @@ -29,7 +29,7 @@ define void @test() { define { <4 x i16>, <4 x i16> } @test_vld1x2_no_align(ptr align 16 %a) { ; CHECK-LABEL: define { <4 x i16>, <4 x i16> } @test_vld1x2_no_align( ; CHECK-SAME: ptr align 16 [[A:%.*]]) { -; CHECK-NEXT: [[TMP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr [[A]]) +; CHECK-NEXT: [[TMP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 16 [[A]]) ; CHECK-NEXT: ret { <4 x i16>, <4 x i16> } [[TMP]] ; %tmp = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr %a) @@ -39,7 +39,7 @@ define { <4 x i16>, <4 x i16> } @test_vld1x2_no_align(ptr align 16 %a) { define { <4 x i16>, <4 x i16> } @test_vld1x2_lower_align(ptr align 16 %a) { ; CHECK-LABEL: define { <4 x i16>, <4 x i16> } @test_vld1x2_lower_align( ; CHECK-SAME: ptr align 16 [[A:%.*]]) { -; CHECK-NEXT: [[TMP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 8 [[A]]) +; CHECK-NEXT: [[TMP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 16 [[A]]) ; CHECK-NEXT: ret { <4 x i16>, <4 x i16> } [[TMP]] ; %tmp = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 8 %a) @@ -59,7 +59,7 @@ define { <4 x i16>, <4 x i16> } @test_vld1x2_higher_align(ptr align 8 %a) { define void @test_vst1x2_no_align(ptr align 16 %a, <4 x i16> %b0, <4 x i16> %b1) { ; CHECK-LABEL: define void @test_vst1x2_no_align( ; CHECK-SAME: ptr align 16 [[A:%.*]], <4 x i16> [[B0:%.*]], <4 x i16> [[B1:%.*]]) { -; CHECK-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr [[A]], <4 x i16> [[B0]], <4 x i16> [[B1]]) +; CHECK-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr align 16 [[A]], <4 x i16> [[B0]], <4 x i16> [[B1]]) ; CHECK-NEXT: ret void ; call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr %a, <4 x i16> %b0, <4 x i16> %b1) From 3d729571fdc86a40218e5743d4386d7d8edc36ae Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 5 Sep 2024 15:27:48 +0800 Subject: [PATCH 192/425] [RISCV] Model dest 
EEW and fix peepholes not checking EEW (#105945) Previously for vector peepholes that fold based on VL, we checked if the VLMAX is the same as a proxy to check that the EEWs were the same. This only worked at LMUL >= 1 because the EMULs of the Src output and user's input had to be the same because the register classes needed to match. At fractional LMULs we would have incorrectly folded something like this: %x:vr = PseudoVADD_VV_MF4 $noreg, $noreg, $noreg, 4, 4 /* e16 */, 0 %y:vr = PseudoVMV_V_V_MF8 $noreg, %x, 4, 3 /* e8 */, 0 This models the EEW of the destination operands of vector instructions with a TSFlag, which is enough to fix the incorrect folding. There's some overlap with the TargetOverlapConstraintType and IsRVVWideningReduction. If we model the source operands as well we may be able to subsume them. --- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 8 +++ llvm/lib/Target/RISCV/RISCVInstrFormats.td | 12 ++++ llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 12 ++++ llvm/lib/Target/RISCV/RISCVInstrInfo.h | 4 ++ llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 57 +++++++++++-------- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 9 ++- llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td | 5 +- llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 3 +- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 27 +++++---- .../CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 16 ++++++ .../Target/RISCV/RISCVInstrInfoTest.cpp | 21 +++++++ 11 files changed, 133 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 7a0b35c1afce2..cf3ea3e4ea213 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -129,6 +129,14 @@ enum { ElementsDependOnMaskShift = ElementsDependOnVLShift + 1, ElementsDependOnMaskMask = 1ULL << ElementsDependOnMaskShift, + + // Indicates the EEW of a vector instruction's destination operand. + // 0 -> 1 + // 1 -> SEW + // 2 -> SEW * 2 + // 3 -> SEW * 4 + DestEEWShift = ElementsDependOnMaskShift + 1, + DestEEWMask = 3ULL << DestEEWShift, }; // Helper functions to read TSFlags. diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index a389320adc876..fcea18f81b390 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -167,6 +167,14 @@ def EltDepsNone : EltDeps; def EltDepsVL : EltDeps; def EltDepsVLMask : EltDeps; +class EEW val> { + bits<2> Value = val; +} +def EEW1 : EEW<0>; +def EEWSEWx1 : EEW<1>; +def EEWSEWx2 : EEW<2>; +def EEWSEWx4 : EEW<3>; + class RVInstCommon pattern, InstFormat format> : Instruction { let Namespace = "RISCV"; @@ -240,6 +248,10 @@ class RVInstCommonBaseInstr; } + +unsigned RISCV::getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW) { + unsigned DestEEW = + (Desc.TSFlags & RISCVII::DestEEWMask) >> RISCVII::DestEEWShift; + // EEW = 1 + if (DestEEW == 0) + return 0; + // EEW = SEW * n + unsigned Scaled = Log2SEW + (DestEEW - 1); + assert(Scaled >= 3 && Scaled <= 6); + return Scaled; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 8494110adffb9..e040891539ddf 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -354,6 +354,10 @@ std::optional getVectorLowDemandedScalarBits(uint16_t Opcode, // Returns the MC opcode of RVV pseudo instruction. 
unsigned getRVVMCOpcode(unsigned RVVPseudoOpcode); +// For a (non-pseudo) RVV instruction \p Desc and the given \p Log2SEW, returns +// the log2 EEW of the destination operand. +unsigned getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW); + // Special immediate for AVL operand of V pseudo instructions to indicate VLMax. static constexpr int64_t VLMaxSentinel = -1LL; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 738bb5d9bd65b..6f7d14d5503bd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -1104,7 +1104,7 @@ def : InstAlias<"vneg.v $vd, $vs", (VRSUB_VX VR:$vd, VR:$vs, X0, zero_reg)>; // The destination vector register group cannot overlap a source vector // register group of a different element width (including the mask register // if masked), otherwise an illegal instruction exception is raised. -let Constraints = "@earlyclobber $vd" in { +let Constraints = "@earlyclobber $vd", DestEEW = EEWSEWx2 in { let RVVConstraint = WidenV in { defm VWADDU_V : VALU_MV_V_X<"vwaddu", 0b110000, "v">; defm VWSUBU_V : VALU_MV_V_X<"vwsubu", 0b110010, "v">; @@ -1121,7 +1121,7 @@ defm VWSUBU_W : VALU_MV_V_X<"vwsubu", 0b110110, "w">; defm VWADD_W : VALU_MV_V_X<"vwadd", 0b110101, "w">; defm VWSUB_W : VALU_MV_V_X<"vwsub", 0b110111, "w">; } // RVVConstraint = WidenW -} // Constraints = "@earlyclobber $vd" +} // Constraints = "@earlyclobber $vd", DestEEW = EEWSEWx2 def : InstAlias<"vwcvt.x.x.v $vd, $vs$vm", (VWADD_VX VR:$vd, VR:$vs, X0, VMaskOp:$vm)>; @@ -1147,10 +1147,11 @@ defm VMADC_V : VALUm_IV_V_X_I<"vmadc", 0b010001>; defm VMADC_V : VALUNoVm_IV_V_X_I<"vmadc", 0b010001>; } // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint defm VSBC_V : VALUm_IV_V_X<"vsbc", 0b010010>; -let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in { +let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, + DestEEW = EEW1 in { defm VMSBC_V : VALUm_IV_V_X<"vmsbc", 0b010011>; defm VMSBC_V : VALUNoVm_IV_V_X<"vmsbc", 0b010011>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint +} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, DestEEW = EEW1 // Vector Bitwise Logical Instructions defm VAND_V : VALU_IV_V_X_I<"vand", 0b001001>; @@ -1183,7 +1184,7 @@ def : InstAlias<"vncvt.x.x.w $vd, $vs", (VNSRL_WX VR:$vd, VR:$vs, X0, zero_reg)>; // Vector Integer Comparison Instructions -let RVVConstraint = NoConstraint in { +let RVVConstraint = NoConstraint, DestEEW = EEW1 in { defm VMSEQ_V : VCMP_IV_V_X_I<"vmseq", 0b011000>; defm VMSNE_V : VCMP_IV_V_X_I<"vmsne", 0b011001>; defm VMSLTU_V : VCMP_IV_V_X<"vmsltu", 0b011010>; @@ -1192,7 +1193,7 @@ defm VMSLEU_V : VCMP_IV_V_X_I<"vmsleu", 0b011100>; defm VMSLE_V : VCMP_IV_V_X_I<"vmsle", 0b011101>; defm VMSGTU_V : VCMP_IV_X_I<"vmsgtu", 0b011110>; defm VMSGT_V : VCMP_IV_X_I<"vmsgt", 0b011111>; -} // RVVConstraint = NoConstraint +} // RVVConstraint = NoConstraint, DestEEW = EEW1 def : InstAlias<"vmsgtu.vv $vd, $va, $vb$vm", (VMSLTU_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>; @@ -1204,7 +1205,7 @@ def : InstAlias<"vmsge.vv $vd, $va, $vb$vm", (VMSLE_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>; let isCodeGenOnly = 0, isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 0, - mayStore = 0 in { + mayStore = 0, DestEEW = EEW1 in { // For unsigned comparisons we need to special case 0 immediate to maintain // the always true/false semantics we would invert if we just decremented the // immediate like we do for signed. 
To match the GNU assembler we will use @@ -1227,7 +1228,7 @@ def PseudoVMSLT_VI : Pseudo<(outs VR:$vd), } let isCodeGenOnly = 0, isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 0, - mayStore = 0 in { + mayStore = 0, DestEEW = EEW1 in { def PseudoVMSGEU_VX : Pseudo<(outs VR:$vd), (ins VR:$vs2, GPR:$rs1), [], "vmsgeu.vx", "$vd, $vs2, $rs1">; @@ -1267,11 +1268,12 @@ defm VREMU_V : VDIV_MV_V_X<"vremu", 0b100010>; defm VREM_V : VDIV_MV_V_X<"vrem", 0b100011>; // Vector Widening Integer Multiply Instructions -let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in { +let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, + DestEEW = EEWSEWx2 in { defm VWMUL_V : VWMUL_MV_V_X<"vwmul", 0b111011>; defm VWMULU_V : VWMUL_MV_V_X<"vwmulu", 0b111000>; defm VWMULSU_V : VWMUL_MV_V_X<"vwmulsu", 0b111010>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV +} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, DestEEW = EEWSEWx2 // Vector Single-Width Integer Multiply-Add Instructions defm VMACC_V : VMAC_MV_V_X<"vmacc", 0b101101>; @@ -1280,10 +1282,12 @@ defm VMADD_V : VMAC_MV_V_X<"vmadd", 0b101001>; defm VNMSUB_V : VMAC_MV_V_X<"vnmsub", 0b101011>; // Vector Widening Integer Multiply-Add Instructions +let DestEEW = EEWSEWx2 in { defm VWMACCU_V : VWMAC_MV_V_X<"vwmaccu", 0b111100>; defm VWMACC_V : VWMAC_MV_V_X<"vwmacc", 0b111101>; defm VWMACCSU_V : VWMAC_MV_V_X<"vwmaccsu", 0b111111>; defm VWMACCUS_V : VWMAC_MV_X<"vwmaccus", 0b111110>; +} // DestEEW = EEWSEWx2 // Vector Integer Merge Instructions defm VMERGE_V : VMRG_IV_V_X_I<"vmerge", 0b010111>; @@ -1342,7 +1346,8 @@ defm VFRSUB_V : VALU_FV_F<"vfrsub", 0b100111>; // Vector Widening Floating-Point Add/Subtract Instructions let Constraints = "@earlyclobber $vd", Uses = [FRM], - mayRaiseFPException = true in { + mayRaiseFPException = true, + DestEEW = EEWSEWx2 in { let RVVConstraint = WidenV in { defm VFWADD_V : VWALU_FV_V_F<"vfwadd", 0b110000, "v">; defm VFWSUB_V : VWALU_FV_V_F<"vfwsub", 0b110010, "v">; @@ -1355,7 +1360,7 @@ let RVVConstraint = WidenW in { defm VFWADD_W : VWALU_FV_V_F<"vfwadd", 0b110100, "w">; defm VFWSUB_W : VWALU_FV_V_F<"vfwsub", 0b110110, "w">; } // RVVConstraint = WidenW -} // Constraints = "@earlyclobber $vd", Uses = [FRM], mayRaiseFPException = true +} // Constraints = "@earlyclobber $vd", Uses = [FRM], mayRaiseFPException = true, DestEEW = EEWSEWx2 // Vector Single-Width Floating-Point Multiply/Divide Instructions let Uses = [FRM], mayRaiseFPException = true in { @@ -1366,9 +1371,9 @@ defm VFRDIV_V : VDIV_FV_F<"vfrdiv", 0b100001>; // Vector Widening Floating-Point Multiply let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, - Uses = [FRM], mayRaiseFPException = true in { + Uses = [FRM], mayRaiseFPException = true, DestEEW = EEWSEWx2 in { defm VFWMUL_V : VWMUL_FV_V_F<"vfwmul", 0b111000>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true +} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true, DestEEW = EEWSEWx2 // Vector Single-Width Floating-Point Fused Multiply-Add Instructions let Uses = [FRM], mayRaiseFPException = true in { @@ -1383,12 +1388,12 @@ defm VFNMSUB_V : VMAC_FV_V_F<"vfnmsub", 0b101011>; } // Vector Widening Floating-Point Fused Multiply-Add Instructions -let Uses = [FRM], mayRaiseFPException = true in { +let Uses = [FRM], mayRaiseFPException = true, DestEEW = EEWSEWx2 in { defm VFWMACC_V : VWMAC_FV_V_F<"vfwmacc", 0b111100>; defm VFWNMACC_V : VWMAC_FV_V_F<"vfwnmacc", 
0b111101>; defm VFWMSAC_V : VWMAC_FV_V_F<"vfwmsac", 0b111110>; defm VFWNMSAC_V : VWMAC_FV_V_F<"vfwnmsac", 0b111111>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true +} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true, DestEEW = EEWSEWx2 // Vector Floating-Point Square-Root Instruction let Uses = [FRM], mayRaiseFPException = true in { @@ -1420,14 +1425,14 @@ def : InstAlias<"vfabs.v $vd, $vs", (VFSGNJX_VV VR:$vd, VR:$vs, VR:$vs, zero_reg)>; // Vector Floating-Point Compare Instructions -let RVVConstraint = NoConstraint, mayRaiseFPException = true in { +let RVVConstraint = NoConstraint, mayRaiseFPException = true, DestEEW = EEW1 in { defm VMFEQ_V : VCMP_FV_V_F<"vmfeq", 0b011000>; defm VMFNE_V : VCMP_FV_V_F<"vmfne", 0b011100>; defm VMFLT_V : VCMP_FV_V_F<"vmflt", 0b011011>; defm VMFLE_V : VCMP_FV_V_F<"vmfle", 0b011001>; defm VMFGT_V : VCMP_FV_F<"vmfgt", 0b011101>; defm VMFGE_V : VCMP_FV_F<"vmfge", 0b011111>; -} // RVVConstraint = NoConstraint, mayRaiseFPException = true +} // RVVConstraint = NoConstraint, mayRaiseFPException = true, DestEEW = EEW1 def : InstAlias<"vmfgt.vv $vd, $va, $vb$vm", (VMFLT_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>; @@ -1471,7 +1476,7 @@ defm VFCVT_F_X_V : VCVTF_IV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>; // Widening Floating-Point/Integer Type-Convert Instructions let Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt, - mayRaiseFPException = true in { + mayRaiseFPException = true, DestEEW = EEWSEWx2 in { let Uses = [FRM] in { defm VFWCVT_XU_F_V : VWCVTI_FV_VS2<"vfwcvt.xu.f.v", 0b010010, 0b01000>; defm VFWCVT_X_F_V : VWCVTI_FV_VS2<"vfwcvt.x.f.v", 0b010010, 0b01001>; @@ -1481,7 +1486,7 @@ defm VFWCVT_RTZ_X_F_V : VWCVTI_FV_VS2<"vfwcvt.rtz.x.f.v", 0b010010, 0b01111>; defm VFWCVT_F_XU_V : VWCVTF_IV_VS2<"vfwcvt.f.xu.v", 0b010010, 0b01010>; defm VFWCVT_F_X_V : VWCVTF_IV_VS2<"vfwcvt.f.x.v", 0b010010, 0b01011>; defm VFWCVT_F_F_V : VWCVTF_FV_VS2<"vfwcvt.f.f.v", 0b010010, 0b01100>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt +} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt, DestEEW = EEWSEWx2 // Narrowing Floating-Point/Integer Type-Convert Instructions let Constraints = "@earlyclobber $vd", mayRaiseFPException = true in { @@ -1515,14 +1520,14 @@ defm VREDXOR : VRED_MV_V<"vredxor", 0b000011>; } // RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask // Vector Widening Integer Reduction Instructions -let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask in { +let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask, DestEEW = EEWSEWx2 in { // Set earlyclobber for following instructions for second and mask operands. // This has the downside that the earlyclobber constraint is too coarse and // will impose unnecessary restrictions by not allowing the destination to // overlap with the first (wide) operand. 
defm VWREDSUMU : VWRED_IV_V<"vwredsumu", 0b110000>; defm VWREDSUM : VWRED_IV_V<"vwredsum", 0b110001>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask +} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask, DestEEW = EEWSEWx2 } // Predicates = [HasVInstructions] @@ -1543,7 +1548,7 @@ def : InstAlias<"vfredsum.vs $vd, $vs2, $vs1$vm", (VFREDUSUM_VS VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm), 0>; // Vector Widening Floating-Point Reduction Instructions -let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask in { +let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask, DestEEW = EEWSEWx2 in { // Set earlyclobber for following instructions for second and mask operands. // This has the downside that the earlyclobber constraint is too coarse and // will impose unnecessary restrictions by not allowing the destination to @@ -1552,7 +1557,7 @@ let Uses = [FRM], mayRaiseFPException = true in { defm VFWREDOSUM : VWREDO_FV_V<"vfwredosum", 0b110011>; defm VFWREDUSUM : VWRED_FV_V<"vfwredusum", 0b110001>; } -} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask +} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask, DestEEW = EEWSEWx2 def : InstAlias<"vfwredsum.vs $vd, $vs2, $vs1$vm", (VFWREDUSUM_VS VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm), 0>; @@ -1560,7 +1565,7 @@ def : InstAlias<"vfwredsum.vs $vd, $vs2, $vs1$vm", let Predicates = [HasVInstructions] in { // Vector Mask-Register Logical Instructions -let RVVConstraint = NoConstraint in { +let RVVConstraint = NoConstraint, DestEEW = EEW1 in { defm VMAND_M : VMALU_MV_Mask<"vmand", 0b011001, "m">; defm VMNAND_M : VMALU_MV_Mask<"vmnand", 0b011101, "m">; defm VMANDN_M : VMALU_MV_Mask<"vmandn", 0b011000, "m">; @@ -1607,12 +1612,14 @@ def : InstAlias<"vpopc.m $vd, $vs2$vm", let Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsVLMask in { +let DestEEW = EEW1 in { // vmsbf.m set-before-first mask bit defm VMSBF_M : VMSFS_MV_V<"vmsbf.m", 0b010100, 0b00001>; // vmsif.m set-including-first mask bit defm VMSIF_M : VMSFS_MV_V<"vmsif.m", 0b010100, 0b00011>; // vmsof.m set-only-first mask bit defm VMSOF_M : VMSFS_MV_V<"vmsof.m", 0b010100, 0b00010>; +} // DestEEW = EEW1 // Vector Iota Instruction defm VIOTA_M : VIOTA_MV_V<"viota.m", 0b010100, 0b10000>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 3c1fb38349d5c..851e817c50125 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -201,21 +201,24 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0, defm FVW : CustomSiFiveVCIX<"fvw", VCIX_XVW, VR, VR, FPR32>, Sched<[]>; } -let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvqmaccdod" in { +let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvqmaccdod", + DestEEW = EEWSEWx4 in { def VQMACCU_2x8x2 : CustomSiFiveVMACC<0b101100, OPMVV, "sf.vqmaccu.2x8x2">; def VQMACC_2x8x2 : CustomSiFiveVMACC<0b101101, OPMVV, "sf.vqmacc.2x8x2">; def VQMACCUS_2x8x2 : CustomSiFiveVMACC<0b101110, OPMVV, "sf.vqmaccus.2x8x2">; def VQMACCSU_2x8x2 : CustomSiFiveVMACC<0b101111, OPMVV, "sf.vqmaccsu.2x8x2">; } -let Predicates = [HasVendorXSfvqmaccqoq], DecoderNamespace = "XSfvqmaccqoq" in { +let Predicates = [HasVendorXSfvqmaccqoq], 
DecoderNamespace = "XSfvqmaccqoq", + DestEEW = EEWSEWx4 in { def VQMACCU_4x8x4 : CustomSiFiveVMACC<0b111100, OPMVV, "sf.vqmaccu.4x8x4">; def VQMACC_4x8x4 : CustomSiFiveVMACC<0b111101, OPMVV, "sf.vqmacc.4x8x4">; def VQMACCUS_4x8x4 : CustomSiFiveVMACC<0b111110, OPMVV, "sf.vqmaccus.4x8x4">; def VQMACCSU_4x8x4 : CustomSiFiveVMACC<0b111111, OPMVV, "sf.vqmaccsu.4x8x4">; } -let Predicates = [HasVendorXSfvfwmaccqqq], DecoderNamespace = "XSfvfwmaccqqq" in { +let Predicates = [HasVendorXSfvfwmaccqqq], DecoderNamespace = "XSfvfwmaccqqq", + DestEEW = EEWSEWx2 in { def VFWMACC_4x4x4 : CustomSiFiveVMACC<0b111100, OPFVV, "sf.vfwmacc.4x4x4">; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index 1b1f3b9b16e44..a79f757753325 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -19,7 +19,7 @@ let Predicates = [HasStdExtZvfbfmin], Constraints = "@earlyclobber $vd", mayRaiseFPException = true in { -let RVVConstraint = WidenCvt in +let RVVConstraint = WidenCvt, DestEEW = EEWSEWx2 in defm VFWCVTBF16_F_F_V : VWCVTF_FV_VS2<"vfwcvtbf16.f.f.v", 0b010010, 0b01101>; let Uses = [FRM] in defm VFNCVTBF16_F_F_W : VNCVTF_FV_VS2<"vfncvtbf16.f.f.w", 0b010010, 0b11101>; @@ -27,6 +27,7 @@ defm VFNCVTBF16_F_F_W : VNCVTF_FV_VS2<"vfncvtbf16.f.f.w", 0b010010, 0b11101>; let Predicates = [HasStdExtZvfbfwma], Constraints = "@earlyclobber $vd_wb, $vd = $vd_wb", - RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true in { + RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true, + DestEEW = EEWSEWx2 in { defm VFWMACCBF16_V : VWMAC_FV_V_F<"vfwmaccbf16", 0b111011>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td index e19a11805c9c0..7ec13e4eaafa7 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td @@ -123,7 +123,8 @@ let Predicates = [HasStdExtZvbb] in { def VCLZ_V : VALUVs2<0b010010, 0b01100, OPMVV, "vclz.v">; def VCPOP_V : VALUVs2<0b010010, 0b01110, OPMVV, "vcpop.v">; def VCTZ_V : VALUVs2<0b010010, 0b01101, OPMVV, "vctz.v">; - let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in + let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, + DestEEW = EEWSEWx2 in defm VWSLL_V : VSHT_IV_V_X_I<"vwsll", 0b110101>; } // Predicates = [HasStdExtZvbb] diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 026e0a365b8dc..a612a03106f02 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -69,6 +69,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool foldUndefPassthruVMV_V_V(MachineInstr &MI); bool foldVMV_V_V(MachineInstr &MI); + bool hasSameEEW(const MachineInstr &User, const MachineInstr &Src) const; bool isAllOnesMask(const MachineInstr *MaskDef) const; std::optional getConstant(const MachineOperand &VL) const; bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const; @@ -98,10 +99,17 @@ static bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { return LHS.getImm() <= RHS.getImm(); } -static unsigned getSEWLMULRatio(const MachineInstr &MI) { - RISCVII::VLMUL LMUL = RISCVII::getLMul(MI.getDesc().TSFlags); - unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); - return RISCVVType::getSEWLMULRatio(1 << Log2SEW, LMUL); +/// Given \p User that has an input operand with EEW=SEW, which uses the 
dest +/// operand of \p Src with an unknown EEW, return true if their EEWs match. +bool RISCVVectorPeephole::hasSameEEW(const MachineInstr &User, + const MachineInstr &Src) const { + unsigned UserLog2SEW = + User.getOperand(RISCVII::getSEWOpNum(User.getDesc())).getImm(); + unsigned SrcLog2SEW = + Src.getOperand(RISCVII::getSEWOpNum(Src.getDesc())).getImm(); + unsigned SrcLog2EEW = RISCV::getDestLog2EEW( + TII->get(RISCV::getRVVMCOpcode(Src.getOpcode())), SrcLog2SEW); + return SrcLog2EEW == UserLog2SEW; } // Attempt to reduce the VL of an instruction whose sole use is feeding a @@ -154,8 +162,8 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { !RISCVII::hasSEWOp(Src->getDesc().TSFlags)) return false; - // Src needs to have the same VLMAX as MI - if (getSEWLMULRatio(MI) != getSEWLMULRatio(*Src)) + // Src's dest needs to have the same EEW as MI's input. + if (!hasSameEEW(MI, *Src)) return false; bool ElementsDependOnVL = RISCVII::elementsDependOnVL( @@ -486,8 +494,7 @@ bool RISCVVectorPeephole::foldUndefPassthruVMV_V_V(MachineInstr &MI) { if (Src && !Src->hasUnmodeledSideEffects() && MRI->hasOneUse(MI.getOperand(2).getReg()) && RISCVII::hasVLOp(Src->getDesc().TSFlags) && - RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags) && - getSEWLMULRatio(MI) == getSEWLMULRatio(*Src)) { + RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags) && hasSameEEW(MI, *Src)) { const MachineOperand &MIVL = MI.getOperand(3); const MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc())); @@ -532,8 +539,8 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { !RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) return false; - // Src needs to have the same VLMAX as MI - if (getSEWLMULRatio(MI) != getSEWLMULRatio(*Src)) + // Src's dest needs to have the same EEW as MI's input. + if (!hasSameEEW(MI, *Src)) return false; // Src needs to have the same passthru as VMV_V_V diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir index 6858231bf0e6c..2f02be025485c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir @@ -73,3 +73,19 @@ body: | %passthru:vr = COPY $v8 %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 1 /* ta, mu */ +... 
+--- +# Shouldn't be folded because the EEWs don't match +name: different_eew +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: different_eew + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_MF4 %passthru, $noreg, $noreg, 4, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVMV_V_V_MF8 %passthru, %x, 4, 3 /* e8 */, 0 /* tu, mu */ + %passthru:vr = COPY $v8 + %x:vr = PseudoVADD_VV_MF4 %passthru, $noreg, $noreg, 4, 4 /* e16 */, 0 /* tu, mu */ + %y:vr = PseudoVMV_V_V_MF8 %passthru, %x, 4, 3 /* e8 */, 0 /* tu, mu */ diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp index fe711619c6320..cc0aca10fc6da 100644 --- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp +++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp @@ -316,6 +316,27 @@ TEST_P(RISCVInstrInfoTest, DescribeLoadedValue) { MF->deleteMachineBasicBlock(MBB); } +TEST_P(RISCVInstrInfoTest, GetDestEEW) { + const RISCVInstrInfo *TII = ST->getInstrInfo(); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VADD_VV), 3), 3u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VWADD_VV), 3), 4u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VLE32_V), 5), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VLSE32_V), 5), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VREDSUM_VS), 4), 4u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VWREDSUM_VS), 4), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VFWREDOSUM_VS), 5), 6u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VFCVT_RTZ_XU_F_V), 4), 4u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VFWCVT_RTZ_XU_F_V), 4), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VSLL_VI), 4), 4u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VWSLL_VI), 4), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VMSEQ_VV), 4), 0u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VMAND_MM), 0), 0u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VIOTA_M), 3), 3u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VQMACCU_2x8x2), 3), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VFWMACC_4x4x4), 4), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::THVdotVMAQA_VV), 5), 5u); +} + } // namespace INSTANTIATE_TEST_SUITE_P(RV32And64, RISCVInstrInfoTest, From f006246299c96486a8e37005a94e07c0bf334ee0 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 09:34:39 +0200 Subject: [PATCH 193/425] [CodeGen] Add generic INIT_UNDEF pseudo (#106744) The InitUndef pass currently uses target-specific pseudo instructions, with one pseudo per register class. Instead, add a generic pseudo instruction, which can be used by all targets and register classes. 
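
A minimal sketch of the resulting change in InitUndef.cpp (the variable names match the pass; the exact call sites are in the diff below):

    // Before: every target had to map each register class to its own pseudo.
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII->get(TII->getUndefInitOpcode(TargetRegClass->getID())), NewReg);

    // After: a single generic opcode covers all targets and register classes.
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII->get(TargetOpcode::INIT_UNDEF), NewReg);

The generic pseudo is expanded to nothing by the AsmPrinter, so targets no longer need per-class pseudo definitions or a getUndefInitOpcode hook.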
--- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 9 - llvm/include/llvm/Support/TargetOpcodes.def | 5 + llvm/include/llvm/Target/Target.td | 7 + llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 4 + llvm/lib/CodeGen/InitUndef.cpp | 7 +- llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 6 - llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 13 -- llvm/lib/Target/ARM/ARMInstrInfo.td | 12 -- llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 5 - llvm/lib/Target/RISCV/RISCVInstrInfo.h | 15 -- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 9 - .../rvv/handle-noreg-with-implicit-def.mir | 4 +- .../rvv/subregister-undef-early-clobber.mir | 176 +++++++++--------- .../RISCV/rvv/undef-earlyclobber-chain.mir | 4 +- .../Thumb2/mve-laneinterleaving-cost.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll | 10 +- .../match-table-imms.td | 2 +- 17 files changed, 119 insertions(+), 173 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 49ce13dd8cbe3..65c5788ac5cc9 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -2278,15 +2278,6 @@ class TargetInstrInfo : public MCInstrInfo { llvm_unreachable("unknown number of operands necessary"); } - /// Gets the opcode for the Pseudo Instruction used to initialize - /// the undef value. If no Instruction is available, this will - /// fail compilation. - virtual unsigned getUndefInitOpcode(unsigned RegClassID) const { - (void)RegClassID; - - llvm_unreachable("Unexpected register class."); - } - private: mutable std::unique_ptr Formatter; unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 635c265a43363..e1883de0c93b4 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -56,6 +56,11 @@ HANDLE_TARGET_OPCODE(INSERT_SUBREG) /// IMPLICIT_DEF - This is the MachineInstr-level equivalent of undef. HANDLE_TARGET_OPCODE(IMPLICIT_DEF) +/// Explicit undef initialization used past IMPLICIT_DEF elimination in cases +/// where an undef operand must be allocated to a different register than an +/// early-clobber result operand. +HANDLE_TARGET_OPCODE(INIT_UNDEF) + /// SUBREG_TO_REG - Assert the value of bits in a super register. /// The result of this instruction is the value of the second operand inserted /// into the subregister specified by the third operand. 
All other bits are diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index b2eb250ae60b6..3e037affe1cfd 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1254,6 +1254,13 @@ def IMPLICIT_DEF : StandardPseudoInstruction { let isAsCheapAsAMove = true; let isMeta = true; } +def INIT_UNDEF : StandardPseudoInstruction { + let OutOperandList = (outs unknown:$dst); + let InOperandList = (ins); + let AsmString = ""; + let hasSideEffects = false; + let Size = 0; +} def SUBREG_TO_REG : StandardPseudoInstruction { let OutOperandList = (outs unknown:$dst); let InOperandList = (ins unknown:$implsrc, unknown:$subsrc, i32imm:$subidx); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 19d23c8ba9678..88e9b9d27d3f2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1832,6 +1832,10 @@ void AsmPrinter::emitFunctionBody() { // This instruction is only used to note jump table debug info, it's // purely meta information. break; + case TargetOpcode::INIT_UNDEF: + // This is only used to influence register allocation behavior, no + // actual initialization is needed. + break; default: emitInstruction(&MI); if (CanDoExtraAnalysis) { diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp index 7c1b90afd495e..8d20f2668de6b 100644 --- a/llvm/lib/CodeGen/InitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -177,8 +177,7 @@ bool InitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, Register TmpInitSubReg = MRI->createVirtualRegister(SubRegClass); LLVM_DEBUG(dbgs() << "Register Class ID" << SubRegClass->getID() << "\n"); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), - TII->get(TII->getUndefInitOpcode(SubRegClass->getID())), - TmpInitSubReg); + TII->get(TargetOpcode::INIT_UNDEF), TmpInitSubReg); Register NewReg = MRI->createVirtualRegister(TargetRegClass); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(TargetOpcode::INSERT_SUBREG), NewReg) @@ -203,9 +202,9 @@ bool InitUndef::fixupIllOperand(MachineInstr *MI, MachineOperand &MO) { const TargetRegisterClass *TargetRegClass = TRI->getLargestSuperClass(MRI->getRegClass(MO.getReg())); LLVM_DEBUG(dbgs() << "Register Class ID" << TargetRegClass->getID() << "\n"); - unsigned Opcode = TII->getUndefInitOpcode(TargetRegClass->getID()); Register NewReg = MRI->createVirtualRegister(TargetRegClass); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(Opcode), NewReg); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(TargetOpcode::INIT_UNDEF), NewReg); MO.setReg(NewReg); if (MO.isUndef()) MO.setIsUndef(false); diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 8eb5d91d3b879..710182985a1e9 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -2411,12 +2411,6 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { case ARM::SEH_EpilogEnd: ATS.emitARMWinCFIEpilogEnd(); return; - - case ARM::PseudoARMInitUndefMQPR: - case ARM::PseudoARMInitUndefSPR: - case ARM::PseudoARMInitUndefDPR_VFP2: - case ARM::PseudoARMInitUndefGPR: - return; } MCInst TmpInst; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 27290f7f76347..aee9797585dbd 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -546,19 +546,6 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { 
std::optional isAddImmediate(const MachineInstr &MI, Register Reg) const override; - - unsigned getUndefInitOpcode(unsigned RegClassID) const override { - if (RegClassID == ARM::MQPRRegClass.getID()) - return ARM::PseudoARMInitUndefMQPR; - if (RegClassID == ARM::SPRRegClass.getID()) - return ARM::PseudoARMInitUndefSPR; - if (RegClassID == ARM::DPR_VFP2RegClass.getID()) - return ARM::PseudoARMInitUndefDPR_VFP2; - if (RegClassID == ARM::GPRRegClass.getID()) - return ARM::PseudoARMInitUndefGPR; - - llvm_unreachable("Unexpected register class."); - } }; /// Get the operands corresponding to the given \p Pred value. By default, the diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 0fc561382084e..ed68c6ff20cde 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -6549,15 +6549,3 @@ let isPseudo = 1 in { let isTerminator = 1 in def SEH_EpilogEnd : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; } - - -//===----------------------------------------------------------------------===// -// Pseudo Instructions for use when early-clobber is defined and Greedy Register -// Allocation is used. This ensures the constraint is used properly. -//===----------------------------------------------------------------------===// -let isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in { - def PseudoARMInitUndefMQPR : PseudoInst<(outs MQPR:$vd), (ins), NoItinerary, []>; - def PseudoARMInitUndefSPR : PseudoInst<(outs SPR:$sd), (ins), NoItinerary, []>; - def PseudoARMInitUndefDPR_VFP2 : PseudoInst<(outs DPR_VFP2:$dd), (ins), NoItinerary, []>; - def PseudoARMInitUndefGPR : PseudoInst<(outs GPR:$rd), (ins), NoItinerary, []>; -} diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 476dde2be39e5..24bca2da652d0 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -303,11 +303,6 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) { case RISCV::KCFI_CHECK: LowerKCFI_CHECK(*MI); return; - case RISCV::PseudoRVVInitUndefM1: - case RISCV::PseudoRVVInitUndefM2: - case RISCV::PseudoRVVInitUndefM4: - case RISCV::PseudoRVVInitUndefM8: - return; case TargetOpcode::STACKMAP: return LowerSTACKMAP(*OutStreamer, SM, *MI); case TargetOpcode::PATCHPOINT: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index e040891539ddf..457db9b9860d0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -293,21 +293,6 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { unsigned getTailDuplicateSize(CodeGenOptLevel OptLevel) const override; - unsigned getUndefInitOpcode(unsigned RegClassID) const override { - switch (RegClassID) { - case RISCV::VRRegClassID: - return RISCV::PseudoRVVInitUndefM1; - case RISCV::VRM2RegClassID: - return RISCV::PseudoRVVInitUndefM2; - case RISCV::VRM4RegClassID: - return RISCV::PseudoRVVInitUndefM4; - case RISCV::VRM8RegClassID: - return RISCV::PseudoRVVInitUndefM8; - default: - llvm_unreachable("Unexpected register class."); - } - } - protected: const RISCVSubtarget &STI; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 1b4303fbbcf80..c91c9c3614a34 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -6116,15 +6116,6 @@ foreach lmul = MxList in { } } -/// Empty pseudo for RISCVInitUndefPass -let 
hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 0, - isCodeGenOnly = 1 in { - def PseudoRVVInitUndefM1 : Pseudo<(outs VR:$vd), (ins), [], "">; - def PseudoRVVInitUndefM2 : Pseudo<(outs VRM2:$vd), (ins), [], "">; - def PseudoRVVInitUndefM4 : Pseudo<(outs VRM4:$vd), (ins), [], "">; - def PseudoRVVInitUndefM8 : Pseudo<(outs VRM8:$vd), (ins), [], "">; -} - //===----------------------------------------------------------------------===// // 6. Configuration-Setting Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir b/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir index e090b313d4f7b..7b4d200ef8a3b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir +++ b/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir @@ -9,8 +9,8 @@ body: | ; MIR-LABEL: name: vrgather_all_undef ; MIR: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF ; MIR-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF - ; MIR-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; MIR-NEXT: early-clobber %1:vr = PseudoVRGATHER_VI_M1 [[DEF1]], killed [[PseudoRVVInitUndefM1_]], 0, 0, 5 /* e32 */, 0 /* tu, mu */ + ; MIR-NEXT: [[INIT_UNDEF:%[0-9]+]]:vr = INIT_UNDEF + ; MIR-NEXT: early-clobber %1:vr = PseudoVRGATHER_VI_M1 [[DEF1]], killed [[INIT_UNDEF]], 0, 0, 5 /* e32 */, 0 /* tu, mu */ ; MIR-NEXT: $v8 = COPY %1 ; MIR-NEXT: PseudoRET implicit $v8 %2:vr = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir index 539d319f3426d..be6ed4d2a6aa1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir +++ b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir @@ -14,10 +14,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_1 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_1 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -52,10 +52,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = 
PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_0 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_0 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -90,10 +90,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_2 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_3 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_3 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -128,10 +128,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_3 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_2 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_2 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* 
tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -166,8 +166,8 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -202,8 +202,8 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -239,12 +239,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_1 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG 
[[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -279,12 +279,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_0 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_0 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -319,12 +319,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_2 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_3 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], 
%subreg.sub_vrm1_3 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -359,12 +359,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_3 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_2 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_2 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -399,12 +399,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_4 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_3 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_5 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_5 ; CHECK-NEXT: early-clobber 
%6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -439,12 +439,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_5 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_3 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_4 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_4 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -479,12 +479,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_6 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_2 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_7 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_7 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed 
[[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -519,12 +519,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_7 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_2 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_6 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_6 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -559,10 +559,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -597,10 +597,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* 
e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -635,10 +635,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_2 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_3 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -673,10 +673,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_3 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_2 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG 
[[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -711,8 +711,8 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M4_]], %subreg.sub_vrm4_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -747,8 +747,8 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M4_]], %subreg.sub_vrm4_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir index 8df2c60c926c3..69078710e9ccf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir +++ b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir @@ -78,8 +78,8 @@ body: | ; CHECK-LABEL: name: undef_early_clobber_chain ; CHECK: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: early-clobber %1:vr = PseudoVRGATHER_VI_M1 undef [[DEF]], [[PseudoRVVInitUndefM1_]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: early-clobber %1:vr = PseudoVRGATHER_VI_M1 undef [[DEF]], [[INIT_UNDEF]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $v8 = COPY %1 ; CHECK-NEXT: PseudoRET implicit $v8 %2:vr = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll index c2511a4992cf5..e86c368e0fe8a 
100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll @@ -365,13 +365,13 @@ define arm_aapcs_vfpcc void @mul_i32(ptr %A, ptr %B, i64 %C, ptr %D) { ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: ldr.w lr, [sp, #20] -; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: vmov.f32 s4, s6 ; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: smull r12, r3, r1, r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.f32 s0, s2 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index 6d581afe9fb31..8eb941371f993 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -229,9 +229,9 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: vmov.f32 s20, s14 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vmov.f32 s22, s15 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: vmullb.s32 q6, q5, q4 ; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov r4, r7, d12 @@ -780,8 +780,8 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vmullb.u32 q5, q4, q3 ; CHECK-NEXT: vmov.f32 s6, s5 @@ -792,6 +792,7 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r5, lo +; CHECK-NEXT: vmullb.u32 q4, q2, q1 ; CHECK-NEXT: bfi r6, r5, #0, #8 ; CHECK-NEXT: vmov r4, r5, d11 ; CHECK-NEXT: lsrl r4, r5, #31 @@ -800,12 +801,11 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: csetm r5, lo ; CHECK-NEXT: bfi r6, r5, #8, #8 -; CHECK-NEXT: vmsr p0, r6 -; CHECK-NEXT: vpsel q3, q3, q0 -; CHECK-NEXT: vmullb.u32 q4, q2, q1 ; CHECK-NEXT: vmov r10, r5, d8 ; CHECK-NEXT: lsrl r10, r5, #31 +; CHECK-NEXT: vmsr p0, r6 ; CHECK-NEXT: subs.w r6, r10, #-1 +; CHECK-NEXT: vpsel q3, q3, q0 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r5, lo diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td index d9a8854cd018f..e0b802447ea2a 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td @@ -34,7 +34,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 3*/ GIMT_Encode4([[L579:[0-9]+]]), +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(20), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 3*/ GIMT_Encode4([[L579:[0-9]+]]), // CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4([[L462:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4([[L493:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4({{[0-9]+}}), From f1ac334b13c22222ed5c71bad04ed8345b2be135 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 5 Sep 2024 08:41:39 +0100 Subject: [PATCH 194/425] [Clang][SemaCXX] Preserve qualifiers in derived-to-base cast in defaulted comparison operators (#102619) Fixes #102588 Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/SemaDeclCXX.cpp | 10 ++-- clang/test/SemaCXX/cxx20-default-compare.cpp | 50 ++++++++++++++++++++ 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 1520f7a2916aa..44ffad94ef41f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -355,6 +355,8 @@ Bug Fixes to C++ Support - Fix an issue with dependent source location expressions (#GH106428), (#GH81155), (#GH80210), (#GH85373) - Fixed a bug in the substitution of empty pack indexing types. (#GH105903) - Clang no longer tries to capture non-odr used default arguments of template parameters of generic lambdas (#GH107048) +- Fixed a bug where defaulted comparison operators would remove ``const`` from base classes. 
(#GH102588) + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 3044f1218f5b2..f90f16c2923d0 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -8450,10 +8450,12 @@ class DefaultedComparisonSynthesizer if (Obj.first.isInvalid() || Obj.second.isInvalid()) return {ExprError(), ExprError()}; CXXCastPath Path = {Base}; - return {S.ImpCastExprToType(Obj.first.get(), Base->getType(), - CK_DerivedToBase, VK_LValue, &Path), - S.ImpCastExprToType(Obj.second.get(), Base->getType(), - CK_DerivedToBase, VK_LValue, &Path)}; + const auto CastToBase = [&](Expr *E) { + QualType ToType = S.Context.getQualifiedType( + Base->getType(), E->getType().getQualifiers()); + return S.ImpCastExprToType(E, ToType, CK_DerivedToBase, VK_LValue, &Path); + }; + return {CastToBase(Obj.first.get()), CastToBase(Obj.second.get())}; } ExprPair getField(FieldDecl *Field) { diff --git a/clang/test/SemaCXX/cxx20-default-compare.cpp b/clang/test/SemaCXX/cxx20-default-compare.cpp index 7074ee885ac4a..3e4673c31e489 100644 --- a/clang/test/SemaCXX/cxx20-default-compare.cpp +++ b/clang/test/SemaCXX/cxx20-default-compare.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 %s -std=c++23 -verify -Wfloat-equal +#include "Inputs/std-compare.h" + struct Foo { float val; bool operator==(const Foo &) const; @@ -15,3 +17,51 @@ bool operator==(const Foo &, const Foo &) = default; // expected-warning {{comp // Declare the defaulted comparison function as a non-member function. Arguments are passed by value. bool operator==(Foo, Foo) = default; // expected-warning {{comparing floating point with == or != is unsafe}} expected-note {{in defaulted equality comparison operator for 'Foo' first required here}} + +namespace GH102588 { +struct A { + int i = 0; + constexpr operator int() const { return i; } + constexpr operator int&() { return ++i; } +}; + +struct B : A { + bool operator==(const B &) const = default; +}; + +constexpr bool f() { + B x; + return x == x; +} + +static_assert(f()); + +struct ConstOnly { + std::strong_ordering operator<=>(const ConstOnly&) const; + std::strong_ordering operator<=>(ConstOnly&) = delete; + friend bool operator==(const ConstOnly&, const ConstOnly&); + friend bool operator==(ConstOnly&, ConstOnly&) = delete; +}; + +struct MutOnly { + std::strong_ordering operator<=>(const MutOnly&) const = delete;; + std::strong_ordering operator<=>(MutOnly&); + friend bool operator==(const MutOnly&, const MutOnly&) = delete;; + friend bool operator==(MutOnly&, MutOnly&); +}; + +struct ConstCheck : ConstOnly { + friend std::strong_ordering operator<=>(const ConstCheck&, const ConstCheck&) = default; + std::strong_ordering operator<=>(ConstCheck const& __restrict) const __restrict = default; + friend bool operator==(const ConstCheck&, const ConstCheck&) = default; + bool operator==(this const ConstCheck&, const ConstCheck&) = default; +}; + +// FIXME: Non-reference explicit object parameter are rejected +struct MutCheck : MutOnly { + friend bool operator==(MutCheck, MutCheck) = default; + // std::strong_ordering operator<=>(this MutCheck, MutCheck) = default; + friend std::strong_ordering operator<=>(MutCheck, MutCheck) = default; + // bool operator==(this MutCheck, MutCheck) = default; +}; +} From cf1ad28169be5d026ec95f351b56b0c090b3e682 Mon Sep 17 00:00:00 2001 From: Daniel Grumberg Date: Thu, 5 Sep 2024 09:15:09 +0100 Subject: [PATCH 195/425] [clang][ExtractAPI] Handle AttributedType fragments transparently (#107262) 
rdar://131958623 --- clang/lib/ExtractAPI/DeclarationFragments.cpp | 13 ++++++++++ clang/test/ExtractAPI/attributed-typedef.m | 24 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 clang/test/ExtractAPI/attributed-typedef.m diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index d77bb1d424f7c..06ce5ed6a6475 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -276,6 +276,19 @@ DeclarationFragments DeclarationFragmentsBuilder::getFragmentsForType( DeclarationFragments Fragments; + if (const MacroQualifiedType *MQT = dyn_cast(T)) { + Fragments.append( + getFragmentsForType(MQT->getUnderlyingType(), Context, After)); + return Fragments; + } + + if (const AttributedType *AT = dyn_cast(T)) { + // FIXME: Serialize Attributes correctly + Fragments.append( + getFragmentsForType(AT->getModifiedType(), Context, After)); + return Fragments; + } + // An ElaboratedType is a sugar for types that are referred to using an // elaborated keyword, e.g., `struct S`, `enum E`, or (in C++) via a // qualified name, e.g., `N::M::type`, or both. diff --git a/clang/test/ExtractAPI/attributed-typedef.m b/clang/test/ExtractAPI/attributed-typedef.m new file mode 100644 index 0000000000000..c948c873ab759 --- /dev/null +++ b/clang/test/ExtractAPI/attributed-typedef.m @@ -0,0 +1,24 @@ +// RUN: rm -rf %t +// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \ +// RUN: -triple arm64-apple-macosx -x objective-c-header %s -o %t/output.symbols.json + +_Pragma("clang assume_nonnull begin") + +struct Foo { int a; }; +typedef struct Foo *Bar; +// RUN: FileCheck %s -input-file %t/output.symbols.json --check-prefix FUNC +void func(Bar b); +// FUNC-LABEL: "!testLabel": "c:@F@func", +// CHECK-NOT: Foo +// CHECK: "pathComponents" + +// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix THING +#define SWIFT_NAME(_name) __attribute__((swift_name(#_name))) +extern Bar const thing SWIFT_NAME(swiftThing); +// THING-LABEL: "!testLabel": "c:@thing" +// THING-NOT: Foo +// THING: "pathComponents" + +_Pragma("clang assume_nonnull end") + +// expected-no-diagnostics From 41373098421f2aa551a0879537864c87d797a102 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 5 Sep 2024 09:15:54 +0100 Subject: [PATCH 196/425] [Clang] Warn with -Wpre-c23-compat instead of -Wpre-c++17-compat for u8 character literals in C23 (#97210) Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/Basic/DiagnosticLexKinds.td | 3 +++ clang/lib/Lex/Lexer.cpp | 4 +++- clang/test/Sema/pre-c2x-compat.c | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 44ffad94ef41f..bd84a2e40fb8b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -278,6 +278,8 @@ Improvements to Clang's diagnostics - The lifetimebound and GSL analysis in clang are coherent, allowing clang to detect more use-after-free bugs. (#GH100549). +- Clang now warns for u8 character literals used in C23 with ``-Wpre-c23-compat`` instead of ``-Wpre-c++17-compat``. 
+ Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 12d7b8c0205ee..fc14bb6aa2165 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -283,6 +283,9 @@ def warn_cxx98_compat_unicode_literal : Warning< def warn_cxx14_compat_u8_character_literal : Warning< "unicode literals are incompatible with C++ standards before C++17">, InGroup, DefaultIgnore; +def warn_c17_compat_u8_character_literal : Warning< + "unicode literals are incompatible with C standards before C23">, + InGroup, DefaultIgnore; def warn_cxx11_compat_user_defined_literal : Warning< "identifier after literal will be treated as a user-defined literal suffix " "in C++11">, InGroup, DefaultIgnore; diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index ef1e1f4bd9aeb..8647e9f2f27c3 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -2428,7 +2428,9 @@ bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, ? diag::warn_cxx98_compat_unicode_literal : diag::warn_c99_compat_unicode_literal); else if (Kind == tok::utf8_char_constant) - Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); + Diag(BufferPtr, LangOpts.CPlusPlus + ? diag::warn_cxx14_compat_u8_character_literal + : diag::warn_c17_compat_u8_character_literal); } char C = getAndAdvanceChar(CurPtr, Result); diff --git a/clang/test/Sema/pre-c2x-compat.c b/clang/test/Sema/pre-c2x-compat.c index fad472f1f72d5..15bb9b58349fa 100644 --- a/clang/test/Sema/pre-c2x-compat.c +++ b/clang/test/Sema/pre-c2x-compat.c @@ -1,3 +1,4 @@ // RUN: %clang_cc1 %s -std=c2x -Wpre-c2x-compat -pedantic -fsyntax-only -verify int digit_seps = 123'456; // expected-warning {{digit separators are incompatible with C standards before C23}} +unsigned char u8_char = u8'x'; // expected-warning {{unicode literals are incompatible with C standards before C23}} From 3e4788377bb29ed389b46521fcba0d06aa985bcf Mon Sep 17 00:00:00 2001 From: Giulio Eulisse <10544+ktf@users.noreply.github.com> Date: Thu, 5 Sep 2024 10:16:51 +0200 Subject: [PATCH 197/425] Recover performance loss after PagedVector introduction (#67972) --- clang/include/clang/Basic/SourceManager.h | 2 +- llvm/include/llvm/ADT/PagedVector.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index d3ccc7ef81c07..e0f1ea435d54e 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -724,7 +724,7 @@ class SourceManager : public RefCountedBase { /// /// Negative FileIDs are indexes into this table. To get from ID to an index, /// use (-ID - 2). - llvm::PagedVector LoadedSLocEntryTable; + llvm::PagedVector LoadedSLocEntryTable; /// For each allocation in LoadedSLocEntryTable, we keep the first FileID. /// We assume exactly one allocation per AST file, and use that to determine diff --git a/llvm/include/llvm/ADT/PagedVector.h b/llvm/include/llvm/ADT/PagedVector.h index 3fcca6d82cb33..52ecd0bb0ba11 100644 --- a/llvm/include/llvm/ADT/PagedVector.h +++ b/llvm/include/llvm/ADT/PagedVector.h @@ -84,7 +84,7 @@ template class PagedVector { assert(Index / PageSize < PageToDataPtrs.size()); T *&PagePtr = PageToDataPtrs[Index / PageSize]; // If the page was not yet allocated, allocate it. 
- if (!PagePtr) { + if (LLVM_UNLIKELY(!PagePtr)) { PagePtr = Allocator.getPointer()->template Allocate(PageSize); // We need to invoke the default constructor on all the elements of the // page. From b206bf0952796cb93f1aca9e47d5764e474e1998 Mon Sep 17 00:00:00 2001 From: Konrad Kleine Date: Thu, 5 Sep 2024 10:41:18 +0200 Subject: [PATCH 198/425] Fix CLANG_BOOTSTRAP_TARGETS in Release.cmake (#106407) # Problem Before this patch you could not build the `stage2-LLVM` for example because you first had to manually add it to `CLANG_BOOTSTRAP_TARGETS` in the `Release.cmake` and also add it to `LLVM_RELEASE_FINAL_STAGE_TARGETS` in the cmake configure run. Now you can just use `-DLLVM_RELEASE_FINAL_STAGE_TARGETS="LLVM;clang"` on the cmake CLI and be able to build the targets `stage2-LLVM` and `stage2-clang` without further changes to the cache file. # Solution Take all `LLVM_RELEASE_FINAL_STAGE_TARGETS` elements and append them prefixed with `stage2-` to `CLANG_BOOTSTRAP_TARGETS`. Afterwards all duplicates are removed. --- clang/cmake/caches/Release.cmake | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake index 6d5f75ca0074e..c93ff40ff3ee4 100644 --- a/clang/cmake/caches/Release.cmake +++ b/clang/cmake/caches/Release.cmake @@ -55,14 +55,22 @@ set(STAGE1_RUNTIMES "compiler-rt") if (LLVM_RELEASE_ENABLE_PGO) list(APPEND STAGE1_PROJECTS "lld") - set(CLANG_BOOTSTRAP_TARGETS + set(tmp_targets generate-profdata stage2-package stage2-clang + stage2 stage2-install stage2-check-all stage2-check-llvm - stage2-check-clang CACHE STRING "") + stage2-check-clang) + + foreach(X IN LISTS LLVM_RELEASE_FINAL_STAGE_TARGETS) + list(APPEND tmp_targets "stage2-${X}") + endforeach() + list(REMOVE_DUPLICATES tmp_targets) + + set(CLANG_BOOTSTRAP_TARGETS "${tmp_targets}" CACHE STRING "") # Configuration for stage2-instrumented set(BOOTSTRAP_CLANG_ENABLE_BOOTSTRAP ON CACHE STRING "") From 3413f957243e4a152726e572986eb730699b8486 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 5 Sep 2024 10:01:52 +0100 Subject: [PATCH 199/425] [AArch64] Add a few extra two-step zext shuffle tests. 
NFC --- llvm/test/CodeGen/AArch64/zext-shuffle.ll | 78 +++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll index af5a92017bbbc..6415fba29ff79 100644 --- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll @@ -263,6 +263,84 @@ define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) { } +define <8 x i32> @v8i32_0246(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_0246: +; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i32> @v8i32_1357(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_1357: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i32> @v8i32_04812(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_04812: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i32> @v8i32_15913(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_15913: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i32> @v8i32_261014(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_261014: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i32> @v8i32_371115(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_371115: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + + define <8 x i64> @zext_add(<32 x i16> %l) { ; CHECK-LABEL: zext_add: ; CHECK: // %bb.0: From a95b212e9957b8f5b7d452b4713a7b6f9ee19e71 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 5 Sep 2024 10:34:34 +0100 Subject: [PATCH 200/425] [DWARF] Don't search scope chain to find DISubprogram for prologues (#107261) Seemingly this goes back to fd07a2a in 2015 -- I anticipate that back then the metadata layout was radically different. But nowadays at least, we can just directly look up the subprogram. 
--- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 77e304383fb5c..9da9382baf863 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2198,11 +2198,10 @@ DebugLoc DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, // Ensure the compile unit is created if the function is called before // beginFunction(). - (void)getOrCreateDwarfCompileUnit( - MF.getFunction().getSubprogram()->getUnit()); + DISubprogram *SP = MF.getFunction().getSubprogram(); + (void)getOrCreateDwarfCompileUnit(SP->getUnit()); // We'd like to list the prologue as "not statements" but GDB behaves // poorly if we do that. Revisit this with caution/GDB (7.5+) testing. - const DISubprogram *SP = PrologEndLoc->getInlinedAtScope()->getSubprogram(); ::recordSourceLine(*Asm, SP->getScopeLine(), 0, SP, DWARF2_FLAG_IS_STMT, CUID, getDwarfVersion(), getUnits()); return PrologEndLoc; From 03d5b7ca3d83eee3514318ef8934ba26bc3d7fa9 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 10:14:34 +0200 Subject: [PATCH 201/425] [MemorySanitizer] Don't create types pointers (NFC) Everything in this pass uses a single addrspace 0 pointer type. Don't try to create it using the typed pointer ctor. This allows removing the type argument from getShadowPtrForVAArgument(). --- .../Instrumentation/MemorySanitizer.cpp | 129 +++++++----------- 1 file changed, 52 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 0b3d85afcacf1..17c5638cf8ced 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -769,8 +769,7 @@ MemorySanitizer::getOrInsertMsanMetadataFunction(Module &M, StringRef Name, ArgsTy... Args) { if (TargetTriple.getArch() == Triple::systemz) { // SystemZ ABI: shadow/origin pair is returned via a hidden parameter. 
- return M.getOrInsertFunction(Name, Type::getVoidTy(*C), - PointerType::get(MsanMetadata, 0), + return M.getOrInsertFunction(Name, Type::getVoidTy(*C), PtrTy, std::forward(Args)...); } @@ -804,29 +803,26 @@ void MemorySanitizer::createKernelApi(Module &M, const TargetLibraryInfo &TLI) { ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), /* va_arg_origin */ IRB.getInt64Ty(), ArrayType::get(OriginTy, kParamTLSSize / 4), OriginTy, OriginTy); - MsanGetContextStateFn = M.getOrInsertFunction( - "__msan_get_context_state", PointerType::get(MsanContextStateTy, 0)); + MsanGetContextStateFn = + M.getOrInsertFunction("__msan_get_context_state", PtrTy); - MsanMetadata = StructType::get(PointerType::get(IRB.getInt8Ty(), 0), - PointerType::get(IRB.getInt32Ty(), 0)); + MsanMetadata = StructType::get(PtrTy, PtrTy); for (int ind = 0, size = 1; ind < 4; ind++, size <<= 1) { std::string name_load = "__msan_metadata_ptr_for_load_" + std::to_string(size); std::string name_store = "__msan_metadata_ptr_for_store_" + std::to_string(size); - MsanMetadataPtrForLoad_1_8[ind] = getOrInsertMsanMetadataFunction( - M, name_load, PointerType::get(IRB.getInt8Ty(), 0)); - MsanMetadataPtrForStore_1_8[ind] = getOrInsertMsanMetadataFunction( - M, name_store, PointerType::get(IRB.getInt8Ty(), 0)); + MsanMetadataPtrForLoad_1_8[ind] = + getOrInsertMsanMetadataFunction(M, name_load, PtrTy); + MsanMetadataPtrForStore_1_8[ind] = + getOrInsertMsanMetadataFunction(M, name_store, PtrTy); } MsanMetadataPtrForLoadN = getOrInsertMsanMetadataFunction( - M, "__msan_metadata_ptr_for_load_n", PointerType::get(IRB.getInt8Ty(), 0), - IRB.getInt64Ty()); + M, "__msan_metadata_ptr_for_load_n", PtrTy, IRB.getInt64Ty()); MsanMetadataPtrForStoreN = getOrInsertMsanMetadataFunction( - M, "__msan_metadata_ptr_for_store_n", - PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty()); + M, "__msan_metadata_ptr_for_store_n", PtrTy, IRB.getInt64Ty()); // Functions for poisoning and unpoisoning memory. MsanPoisonAllocaFn = M.getOrInsertFunction( @@ -937,9 +933,8 @@ void MemorySanitizer::initializeCallbacks(Module &M, const TargetLibraryInfo &TL TLI.getAttrList(C, {1}, /*Signed=*/true), PtrTy, PtrTy, IRB.getInt32Ty(), IntptrTy); - MsanInstrumentAsmStoreFn = - M.getOrInsertFunction("__msan_instrument_asm_store", IRB.getVoidTy(), - PointerType::get(IRB.getInt8Ty(), 0), IntptrTy); + MsanInstrumentAsmStoreFn = M.getOrInsertFunction( + "__msan_instrument_asm_store", IRB.getVoidTy(), PtrTy, IntptrTy); if (CompileKernel) { createKernelApi(M, TLI); @@ -1264,8 +1259,7 @@ struct MemorySanitizerVisitor : public InstVisitor { Align CurrentAlignment = Alignment; if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) { Value *IntptrOrigin = originToIntptr(IRB, Origin); - Value *IntptrOriginPtr = - IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0)); + Value *IntptrOriginPtr = IRB.CreatePointerCast(OriginPtr, MS.PtrTy); for (unsigned i = 0; i < Size / IntptrSize; ++i) { Value *Ptr = i ? 
IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i) : IntptrOriginPtr; @@ -1691,7 +1685,7 @@ struct MemorySanitizerVisitor : public InstVisitor { VectTy->getElementCount()); } assert(IntPtrTy == MS.IntptrTy); - return PointerType::get(*MS.C, 0); + return MS.PtrTy; } Constant *constToIntPtr(Type *IntPtrTy, uint64_t C) const { @@ -1787,8 +1781,7 @@ struct MemorySanitizerVisitor : public InstVisitor { TypeSize Size = DL.getTypeStoreSize(ShadowTy); FunctionCallee Getter = MS.getKmsanShadowOriginAccessFn(isStore, Size); - Value *AddrCast = - IRB.CreatePointerCast(Addr, PointerType::get(IRB.getInt8Ty(), 0)); + Value *AddrCast = IRB.CreatePointerCast(Addr, MS.PtrTy); if (Getter) { ShadowOriginPtrs = createMetadataCall(IRB, Getter, AddrCast); } else { @@ -1799,7 +1792,7 @@ struct MemorySanitizerVisitor : public InstVisitor { AddrCast, SizeVal); } Value *ShadowPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 0); - ShadowPtr = IRB.CreatePointerCast(ShadowPtr, PointerType::get(ShadowTy, 0)); + ShadowPtr = IRB.CreatePointerCast(ShadowPtr, MS.PtrTy); Value *OriginPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 1); return std::make_pair(ShadowPtr, OriginPtr); @@ -5009,21 +5002,19 @@ struct VarArgHelperBase : public VarArgHelper { } /// Compute the shadow address for a given va_arg. - Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, - unsigned ArgOffset) { + Value *getShadowPtrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset) { Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0), - "_msarg_va_s"); + return IRB.CreateIntToPtr(Base, MS.PtrTy, "_msarg_va_s"); } /// Compute the shadow address for a given va_arg. - Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, - unsigned ArgOffset, unsigned ArgSize) { + Value *getShadowPtrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset, + unsigned ArgSize) { // Make sure we don't overflow __msan_va_arg_tls. if (ArgOffset + ArgSize > kParamTLSSize) return nullptr; - return getShadowPtrForVAArgument(Ty, IRB, ArgOffset); + return getShadowPtrForVAArgument(IRB, ArgOffset); } /// Compute the origin address for a given va_arg. @@ -5033,8 +5024,7 @@ struct VarArgHelperBase : public VarArgHelper { // getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never // overflow. 
Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0), - "_msarg_va_o"); + return IRB.CreateIntToPtr(Base, MS.PtrTy, "_msarg_va_o"); } void CleanUnusedTLS(IRBuilder<> &IRB, Value *ShadowBase, @@ -5147,8 +5137,7 @@ struct VarArgAMD64Helper : public VarArgHelperBase { uint64_t ArgSize = DL.getTypeAllocSize(RealTy); uint64_t AlignedSize = alignTo(ArgSize, 8); unsigned BaseOffset = OverflowOffset; - Value *ShadowBase = - getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset); + Value *ShadowBase = getShadowPtrForVAArgument(IRB, OverflowOffset); Value *OriginBase = nullptr; if (MS.TrackOrigins) OriginBase = getOriginPtrForVAArgument(IRB, OverflowOffset); @@ -5177,14 +5166,14 @@ struct VarArgAMD64Helper : public VarArgHelperBase { Value *ShadowBase, *OriginBase = nullptr; switch (AK) { case AK_GeneralPurpose: - ShadowBase = getShadowPtrForVAArgument(A->getType(), IRB, GpOffset); + ShadowBase = getShadowPtrForVAArgument(IRB, GpOffset); if (MS.TrackOrigins) OriginBase = getOriginPtrForVAArgument(IRB, GpOffset); GpOffset += 8; assert(GpOffset <= kParamTLSSize); break; case AK_FloatingPoint: - ShadowBase = getShadowPtrForVAArgument(A->getType(), IRB, FpOffset); + ShadowBase = getShadowPtrForVAArgument(IRB, FpOffset); if (MS.TrackOrigins) OriginBase = getOriginPtrForVAArgument(IRB, FpOffset); FpOffset += 16; @@ -5196,8 +5185,7 @@ struct VarArgAMD64Helper : public VarArgHelperBase { uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); uint64_t AlignedSize = alignTo(ArgSize, 8); unsigned BaseOffset = OverflowOffset; - ShadowBase = - getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset); + ShadowBase = getShadowPtrForVAArgument(IRB, OverflowOffset); if (MS.TrackOrigins) { OriginBase = getOriginPtrForVAArgument(IRB, OverflowOffset); } @@ -5263,13 +5251,11 @@ struct VarArgAMD64Helper : public VarArgHelperBase { NextNodeIRBuilder IRB(OrigInst); Value *VAListTag = OrigInst->getArgOperand(0); - Type *RegSaveAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, 16)), - PointerType::get(RegSaveAreaPtrTy, 0)); - Value *RegSaveAreaPtr = - IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); + MS.PtrTy); + Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(16); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = @@ -5280,13 +5266,12 @@ struct VarArgAMD64Helper : public VarArgHelperBase { if (MS.TrackOrigins) IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy, Alignment, AMD64FpEndOffset); - Type *OverflowArgAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr( IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, 8)), - PointerType::get(OverflowArgAreaPtrTy, 0)); + MS.PtrTy); Value *OverflowArgAreaPtr = - IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr); + IRB.CreateLoad(MS.PtrTy, OverflowArgAreaPtrPtr); Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr; std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) = MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(), @@ -5329,7 +5314,7 @@ struct VarArgMIPS64Helper : public VarArgHelperBase { if (ArgSize < 8) VAArgOffset += (8 - ArgSize); } - Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset, 
ArgSize); + Base = getShadowPtrForVAArgument(IRB, VAArgOffset, ArgSize); VAArgOffset += ArgSize; VAArgOffset = alignTo(VAArgOffset, 8); if (!Base) @@ -5371,12 +5356,9 @@ struct VarArgMIPS64Helper : public VarArgHelperBase { for (CallInst *OrigInst : VAStartInstrumentationList) { NextNodeIRBuilder IRB(OrigInst); Value *VAListTag = OrigInst->getArgOperand(0); - Type *RegSaveAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* - Value *RegSaveAreaPtrPtr = - IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - PointerType::get(RegSaveAreaPtrTy, 0)); - Value *RegSaveAreaPtr = - IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); + Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( + IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), MS.PtrTy); + Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = @@ -5460,11 +5442,11 @@ struct VarArgAArch64Helper : public VarArgHelperBase { Value *Base; switch (AK) { case AK_GeneralPurpose: - Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset); + Base = getShadowPtrForVAArgument(IRB, GrOffset); GrOffset += 8 * RegNum; break; case AK_FloatingPoint: - Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset); + Base = getShadowPtrForVAArgument(IRB, VrOffset); VrOffset += 16 * RegNum; break; case AK_Memory: @@ -5475,7 +5457,7 @@ struct VarArgAArch64Helper : public VarArgHelperBase { uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); uint64_t AlignedSize = alignTo(ArgSize, 8); unsigned BaseOffset = OverflowOffset; - Base = getShadowPtrForVAArgument(A->getType(), IRB, BaseOffset); + Base = getShadowPtrForVAArgument(IRB, BaseOffset); OverflowOffset += AlignedSize; if (OverflowOffset > kParamTLSSize) { // We have no space to copy shadow there. 
@@ -5500,7 +5482,7 @@ struct VarArgAArch64Helper : public VarArgHelperBase { Value *SaveAreaPtrPtr = IRB.CreateIntToPtr( IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, offset)), - PointerType::get(*MS.C, 0)); + MS.PtrTy); return IRB.CreateLoad(Type::getInt64Ty(*MS.C), SaveAreaPtrPtr); } @@ -5509,7 +5491,7 @@ struct VarArgAArch64Helper : public VarArgHelperBase { Value *SaveAreaPtr = IRB.CreateIntToPtr( IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, offset)), - PointerType::get(*MS.C, 0)); + MS.PtrTy); Value *SaveArea32 = IRB.CreateLoad(IRB.getInt32Ty(), SaveAreaPtr); return IRB.CreateSExt(SaveArea32, MS.IntptrTy); } @@ -5670,8 +5652,8 @@ struct VarArgPowerPC64Helper : public VarArgHelperBase { ArgAlign = Align(8); VAArgOffset = alignTo(VAArgOffset, ArgAlign); if (!IsFixed) { - Value *Base = getShadowPtrForVAArgument( - RealTy, IRB, VAArgOffset - VAArgBase, ArgSize); + Value *Base = + getShadowPtrForVAArgument(IRB, VAArgOffset - VAArgBase, ArgSize); if (Base) { Value *AShadowPtr, *AOriginPtr; std::tie(AShadowPtr, AOriginPtr) = @@ -5707,8 +5689,8 @@ struct VarArgPowerPC64Helper : public VarArgHelperBase { VAArgOffset += (8 - ArgSize); } if (!IsFixed) { - Base = getShadowPtrForVAArgument(A->getType(), IRB, - VAArgOffset - VAArgBase, ArgSize); + Base = + getShadowPtrForVAArgument(IRB, VAArgOffset - VAArgBase, ArgSize); if (Base) IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); } @@ -5755,12 +5737,9 @@ struct VarArgPowerPC64Helper : public VarArgHelperBase { for (CallInst *OrigInst : VAStartInstrumentationList) { NextNodeIRBuilder IRB(OrigInst); Value *VAListTag = OrigInst->getArgOperand(0); - Type *RegSaveAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* - Value *RegSaveAreaPtrPtr = - IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - PointerType::get(RegSaveAreaPtrTy, 0)); - Value *RegSaveAreaPtr = - IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); + Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( + IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), MS.PtrTy); + Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = @@ -5855,7 +5834,7 @@ struct VarArgSystemZHelper : public VarArgHelperBase { Type *T = A->getType(); ArgKind AK = classifyArgument(T); if (AK == ArgKind::Indirect) { - T = PointerType::get(T, 0); + T = MS.PtrTy; AK = ArgKind::GeneralPurpose; } if (AK == ArgKind::GeneralPurpose && GpOffset >= SystemZGpEndOffset) @@ -5948,8 +5927,7 @@ struct VarArgSystemZHelper : public VarArgHelperBase { if (SE != ShadowExtension::None) Shadow = MSV.CreateShadowCast(IRB, Shadow, IRB.getInt64Ty(), /*Signed*/ SE == ShadowExtension::Sign); - ShadowBase = IRB.CreateIntToPtr( - ShadowBase, PointerType::get(Shadow->getType(), 0), "_msarg_va_s"); + ShadowBase = IRB.CreateIntToPtr(ShadowBase, MS.PtrTy, "_msarg_va_s"); IRB.CreateStore(Shadow, ShadowBase); if (MS.TrackOrigins) { Value *Origin = MSV.getOrigin(A); @@ -5964,13 +5942,12 @@ struct VarArgSystemZHelper : public VarArgHelperBase { } void copyRegSaveArea(IRBuilder<> &IRB, Value *VAListTag) { - Type *RegSaveAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( IRB.CreateAdd( IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, SystemZRegSaveAreaPtrOffset)), - PointerType::get(RegSaveAreaPtrTy, 0)); - Value *RegSaveAreaPtr = 
IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); + MS.PtrTy); + Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = @@ -5991,14 +5968,12 @@ struct VarArgSystemZHelper : public VarArgHelperBase { // FIXME: This implementation limits OverflowOffset to kParamTLSSize, so we // don't know real overflow size and can't clear shadow beyond kParamTLSSize. void copyOverflowArea(IRBuilder<> &IRB, Value *VAListTag) { - Type *OverflowArgAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr( IRB.CreateAdd( IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, SystemZOverflowArgAreaPtrOffset)), - PointerType::get(OverflowArgAreaPtrTy, 0)); - Value *OverflowArgAreaPtr = - IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr); + MS.PtrTy); + Value *OverflowArgAreaPtr = IRB.CreateLoad(MS.PtrTy, OverflowArgAreaPtrPtr); Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr; const Align Alignment = Align(8); std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) = From 071606ab282bb622a87759569b7044ec19a9c641 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 5 Sep 2024 18:03:11 +0800 Subject: [PATCH 202/425] [RISCV] Remove RV32 FIXMEs completed in #107290. NFC --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll | 2 -- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll | 2 -- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll | 2 -- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll | 2 -- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll | 2 -- 5 files changed, 10 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll index 805a3c640957b..6246ef7db0cb3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -1392,8 +1392,6 @@ define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -; FIXME: We don't match vadd.vi on RV32. - define <32 x i64> @vadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vadd_vx_v32i64_evl12: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll index c5dd6ac344a37..5030fda9dea33 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll @@ -1405,8 +1405,6 @@ define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -; FIXME: We don't match vsadd.vi on RV32. - define <32 x i64> @vsadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vsadd_vx_v32i64_evl12: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll index 17d9c437590a7..562399ea33e7a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll @@ -1401,8 +1401,6 @@ define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -; FIXME: We don't match vsaddu.vi on RV32. 
- define <32 x i64> @vsaddu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vsaddu_vx_v32i64_evl12: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll index 90e1b5ce55752..549c6ca11e320 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll @@ -1447,8 +1447,6 @@ define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -; FIXME: We don't match vssub.vi on RV32. - define <32 x i64> @vssub_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vssub_vx_v32i64_evl12: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll index 59899ab8b9994..683f1150310b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll @@ -1442,8 +1442,6 @@ define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -; FIXME: We don't match vssubu.vi on RV32. - define <32 x i64> @vssubu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vssubu_vx_v32i64_evl12: ; CHECK: # %bb.0: From 5ee73953f03fe0cf53190c8dc9a257c752ab4171 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 5 Sep 2024 11:10:42 +0100 Subject: [PATCH 203/425] [AMDGPU] Add image_atomic_fmin/fmax as aliases for GFX12 (#107242) This just follows SP3. --- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 4 ++++ llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index b4e58cfd98a23..5c49a8116ae7f 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1675,6 +1675,10 @@ defm IMAGE_ATOMIC_PK_ADD_BF16 : MIMG_Atomic , "image_atomic_add_flt", 0, 1>; defm IMAGE_ATOMIC_MIN_FLT : MIMG_Atomic , "image_atomic_min_num_flt", 0, 1, "image_atomic_min_flt">; defm IMAGE_ATOMIC_MAX_FLT : MIMG_Atomic , "image_atomic_max_num_flt", 0, 1, "image_atomic_max_flt">; +let AssemblerPredicate = isGFX12Plus in { + def : AMDGPUMnemonicAlias<"image_atomic_fmin", "image_atomic_min_flt">; + def : AMDGPUMnemonicAlias<"image_atomic_fmax", "image_atomic_max_flt">; +} defm IMAGE_SAMPLE : MIMG_Sampler_WQM , AMDGPUSample>; let OtherPredicates = [HasImageInsts, HasExtendedImageInsts] in { diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s index a88a3ef100fb4..fceab4b7830f9 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s @@ -29,3 +29,9 @@ image_atomic_min_num_flt v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D image_atomic_max_num_flt v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D // GFX12: image_atomic_max_flt v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] + +image_atomic_fmin v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +// GFX12: image_atomic_min_flt v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x00,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] + +image_atomic_fmax v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +// GFX12: image_atomic_max_flt v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] From 
d0278cf395e09bfb8dbef9cb92e6103be91e1eb3 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 5 Sep 2024 18:11:20 +0800 Subject: [PATCH 204/425] [RISCV] Remove some more completed FIXMEs from tests. NFC --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll | 2 -- 7 files changed, 8 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll index 776a1e9bab6b2..81fb86cd81cd3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -47,7 +47,6 @@ define <32 x i32> @insertelt_v32i32_0(<32 x i32> %a, i32 %y) { ret <32 x i32> %b } -; FIXME: Should only require an m2 slideup define <32 x i32> @insertelt_v32i32_4(<32 x i32> %a, i32 %y) { ; CHECK-LABEL: insertelt_v32i32_4: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll index 2a4fbb248cd9c..feeef73e538ae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll @@ -1425,7 +1425,6 @@ define @vadd_vi_nxv32i32_evl_nx8( %va, %v } -; FIXME: The first vadd.vi should be able to infer that its AVL is equivalent to VLMAX. ; FIXME: The upper half of the operation is doing nothing but we don't catch ; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) ; (the "original" %evl is the "and", due to known-bits issues with legalizing diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll index 5fdfb332da7cf..f65e708f5303c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll @@ -1062,7 +1062,6 @@ define @vmax_vx_nxv32i32_evl_nx8( %va, i3 ret %v } -; FIXME: The first vmax.vx should be able to infer that its AVL is equivalent to VLMAX. ; FIXME: The upper half of the operation is doing nothing but we don't catch ; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) ; (the "original" %evl is the "and", due to known-bits issues with legalizing diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll index 7d678950b7a3c..df1ad58e5ecbd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll @@ -1061,7 +1061,6 @@ define @vmaxu_vx_nxv32i32_evl_nx8( %va, i ret %v } -; FIXME: The first vmaxu.vx should be able to infer that its AVL is equivalent to VLMAX. ; FIXME: The upper half of the operation is doing nothing but we don't catch ; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) ; (the "original" %evl is the "and", due to known-bits issues with legalizing diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll index 98a288ed68b9a..0bf0638633aa4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll @@ -1062,7 +1062,6 @@ define @vmin_vx_nxv32i32_evl_nx8( %va, i3 ret %v } -; FIXME: The first vmin.vx should be able to infer that its AVL is equivalent to VLMAX. 
; FIXME: The upper half of the operation is doing nothing but we don't catch ; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) ; (the "original" %evl is the "and", due to known-bits issues with legalizing diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll index 34b554b7ff514..2acebdf2e646d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll @@ -1061,7 +1061,6 @@ define @vminu_vx_nxv32i32_evl_nx8( %va, i ret %v } -; FIXME: The first vminu.vx should be able to infer that its AVL is equivalent to VLMAX. ; FIXME: The upper half of the operation is doing nothing but we don't catch ; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) ; (the "original" %evl is the "and", due to known-bits issues with legalizing diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll index a4d58985b75de..b7ce0e3f196f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll @@ -1394,8 +1394,6 @@ define @i1_zext( %va, %vb } ; %x.i32 and %y.i32 are disjoint, so DAGCombiner will combine it into an or. -; FIXME: We should be able to recover the or into vwaddu.vv if the disjoint -; flag is set. define @vwaddu_vv_disjoint_or_add( %x.i8, %y.i8) { ; CHECK-LABEL: vwaddu_vv_disjoint_or_add: ; CHECK: # %bb.0: From 3299bc863fd74613fdfad2a2fde3f75de79bd645 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 5 Sep 2024 11:18:29 +0100 Subject: [PATCH 205/425] [DWARF] Identify prologue_end by instruction rather than DILocation (#107264) Currently, we identify the end of the prologue as being "the instruction that first has *this* DebugLoc". It works well enough, but I feel identifying a position in a function is best communicated by a MachineInstr. Plus, I've got some patches coming that depend upon this. --- llvm/include/llvm/CodeGen/DebugHandlerBase.h | 2 +- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 23 ++++++++++---------- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 6 +++-- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h index 85046c200ff9b..d39e7e68cb255 100644 --- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h +++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h @@ -74,7 +74,7 @@ class DebugHandlerBase { /// This location indicates end of function prologue and beginning of /// function body. - DebugLoc PrologEndLoc; + const MachineInstr *PrologEndLoc; /// This block includes epilogue instructions. const MachineBasicBlock *EpilogBeginBlock = nullptr; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 9da9382baf863..ea3fed8817d89 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2111,9 +2111,9 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { // (The new location might be an explicit line 0, which we do emit.) if (DL.getLine() == 0 && LastAsmLine == 0) return; - if (DL == PrologEndLoc) { + if (MI == PrologEndLoc) { Flags |= DWARF2_FLAG_PROLOGUE_END | DWARF2_FLAG_IS_STMT; - PrologEndLoc = DebugLoc(); + PrologEndLoc = nullptr; } // If the line changed, we call that a new statement; unless we went to // line 0 and came back, in which case it is not a new statement. 
We also @@ -2131,10 +2131,11 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { PrevInstLoc = DL; } -static std::pair findPrologueEndLoc(const MachineFunction *MF) { +static std::pair +findPrologueEndLoc(const MachineFunction *MF) { // First known non-DBG_VALUE and non-frame setup location marks // the beginning of the function body. - DebugLoc LineZeroLoc; + const MachineInstr *LineZeroLoc = nullptr; const Function &F = MF->getFunction(); // Some instructions may be inserted into prologue after this function. Must @@ -2151,9 +2152,9 @@ static std::pair findPrologueEndLoc(const MachineFunction *MF) { // meaningful breakpoint. If none is found, return the first // location after the frame setup. if (MI.getDebugLoc().getLine()) - return std::make_pair(MI.getDebugLoc(), IsEmptyPrologue); + return std::make_pair(&MI, IsEmptyPrologue); - LineZeroLoc = MI.getDebugLoc(); + LineZeroLoc = &MI; } IsEmptyPrologue = false; } @@ -2184,10 +2185,10 @@ static void recordSourceLine(AsmPrinter &Asm, unsigned Line, unsigned Col, Discriminator, Fn); } -DebugLoc DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, - unsigned CUID) { - std::pair PrologEnd = findPrologueEndLoc(&MF); - DebugLoc PrologEndLoc = PrologEnd.first; +const MachineInstr * +DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, unsigned CUID) { + std::pair PrologEnd = findPrologueEndLoc(&MF); + const MachineInstr *PrologEndLoc = PrologEnd.first; bool IsEmptyPrologue = PrologEnd.second; // Get beginning of function. @@ -2206,7 +2207,7 @@ DebugLoc DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, CUID, getDwarfVersion(), getUnits()); return PrologEndLoc; } - return DebugLoc(); + return nullptr; } // Gather pre-function debug information. Assumes being called immediately diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 6e379396ea079..19f5b677bb8d0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -724,8 +724,10 @@ class DwarfDebug : public DebugHandlerBase { /// Emit all Dwarf sections that should come after the content. void endModule() override; - /// Emits inital debug location directive. - DebugLoc emitInitialLocDirective(const MachineFunction &MF, unsigned CUID); + /// Emits inital debug location directive. Returns instruction at which + /// the function prologue ends. + const MachineInstr *emitInitialLocDirective(const MachineFunction &MF, + unsigned CUID); /// Process beginning of an instruction. void beginInstruction(const MachineInstr *MI) override; From 95684afbcd59f34be580f75ee32f766874b5d0f5 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 5 Sep 2024 12:53:57 +0200 Subject: [PATCH 206/425] [IR][ARM64EC][NFC] Clean up and document ARM64EC mangling helpers. (#107230) --- llvm/include/llvm/IR/Mangler.h | 5 +++++ llvm/lib/IR/Mangler.cpp | 39 +++++++++++++++++----------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/IR/Mangler.h b/llvm/include/llvm/IR/Mangler.h index f28ffc961b6db..349f9e6e75233 100644 --- a/llvm/include/llvm/IR/Mangler.h +++ b/llvm/include/llvm/IR/Mangler.h @@ -53,7 +53,12 @@ void emitLinkerFlagsForGlobalCOFF(raw_ostream &OS, const GlobalValue *GV, void emitLinkerFlagsForUsedCOFF(raw_ostream &OS, const GlobalValue *GV, const Triple &T, Mangler &M); +/// Returns the ARM64EC mangled function name unless the input is already +/// mangled. 
std::optional getArm64ECMangledFunctionName(StringRef Name); + +/// Returns the ARM64EC demangled function name, unless the input is not +/// mangled. std::optional getArm64ECDemangledFunctionName(StringRef Name); } // End llvm namespace diff --git a/llvm/lib/IR/Mangler.cpp b/llvm/lib/IR/Mangler.cpp index e6c3ea9d56883..15a4debf191a5 100644 --- a/llvm/lib/IR/Mangler.cpp +++ b/llvm/lib/IR/Mangler.cpp @@ -291,39 +291,40 @@ void llvm::emitLinkerFlagsForUsedCOFF(raw_ostream &OS, const GlobalValue *GV, } std::optional llvm::getArm64ECMangledFunctionName(StringRef Name) { - bool IsCppFn = Name[0] == '?'; - if (IsCppFn && Name.contains("$$h")) - return std::nullopt; - if (!IsCppFn && Name[0] == '#') - return std::nullopt; + if (Name[0] != '?') { + // For non-C++ symbols, prefix the name with "#" unless it's already + // mangled. + if (Name[0] == '#') + return std::nullopt; + return std::optional(("#" + Name).str()); + } - StringRef Prefix = "$$h"; - size_t InsertIdx = 0; - if (IsCppFn) { - InsertIdx = Name.find("@@"); - size_t ThreeAtSignsIdx = Name.find("@@@"); - if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) { - InsertIdx += 2; - } else { - InsertIdx = Name.find("@"); - if (InsertIdx != std::string::npos) - InsertIdx++; - } + // Insert the ARM64EC "$$h" tag after the mangled function name. + if (Name.contains("$$h")) + return std::nullopt; + size_t InsertIdx = Name.find("@@"); + size_t ThreeAtSignsIdx = Name.find("@@@"); + if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) { + InsertIdx += 2; } else { - Prefix = "#"; + InsertIdx = Name.find("@"); + if (InsertIdx != std::string::npos) + InsertIdx++; } return std::optional( - (Name.substr(0, InsertIdx) + Prefix + Name.substr(InsertIdx)).str()); + (Name.substr(0, InsertIdx) + "$$h" + Name.substr(InsertIdx)).str()); } std::optional llvm::getArm64ECDemangledFunctionName(StringRef Name) { + // For non-C++ names, drop the "#" prefix. if (Name[0] == '#') return std::optional(Name.substr(1)); if (Name[0] != '?') return std::nullopt; + // Drop the ARM64EC "$$h" tag. std::pair Pair = Name.split("$$h"); if (Pair.second.empty()) return std::nullopt; From 87b4b648585f69a2ea148e86543aa31474e59acd Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Thu, 5 Sep 2024 13:17:17 +0200 Subject: [PATCH 207/425] Fix a typo in CheckExprLifetime.cpp, NFC --- clang/lib/Sema/CheckExprLifetime.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 1482711cc2839..6ed6b8aaa7040 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -328,7 +328,7 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { // We assuments that a normal assingment operator always returns *this, that is, // an lvalue reference that is the same type as the implicit object parameter // (or the LHS for a non-member operator$=). -static bool isNormalAsisgnmentOperator(const FunctionDecl *FD) { +static bool isNormalAssignmentOperator(const FunctionDecl *FD) { OverloadedOperatorKind OO = FD->getDeclName().getCXXOverloadedOperator(); if (OO == OO_Equal || isCompoundAssignmentOperator(OO)) { QualType RetT = FD->getReturnType(); @@ -362,7 +362,7 @@ static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { return true; } - return isNormalAsisgnmentOperator(FD); + return isNormalAssignmentOperator(FD); } // Visit lifetimebound or gsl-pointer arguments. 
@@ -940,10 +940,10 @@ static bool pathOnlyHandlesGslPointer(IndirectLocalPath &Path) { return false; } -static bool isAssginmentOperatorLifetimeBound(CXXMethodDecl *CMD) { +static bool isAssignmentOperatorLifetimeBound(CXXMethodDecl *CMD) { if (!CMD) return false; - return isNormalAsisgnmentOperator(CMD) && CMD->param_size() == 1 && + return isNormalAssignmentOperator(CMD) && CMD->param_size() == 1 && CMD->getParamDecl(0)->hasAttr(); } @@ -953,7 +953,7 @@ static bool shouldRunGSLAssignmentAnalysis(const Sema &SemaRef, diag::warn_dangling_lifetime_pointer_assignment, SourceLocation()); return (EnableGSLAssignmentWarnings && (isRecordWithAttr(Entity.LHS->getType()) || - isAssginmentOperatorLifetimeBound(Entity.AssignmentOperator))); + isAssignmentOperatorLifetimeBound(Entity.AssignmentOperator))); } static void checkExprLifetimeImpl(Sema &SemaRef, From 3e070906eff720dc44aee86e533e12aafc8bb14b Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Thu, 5 Sep 2024 13:24:38 +0200 Subject: [PATCH 208/425] Fix llvm-else-after-return clang-tidy warning in CheckExprLifetime.cpp, NFC --- clang/lib/Sema/CheckExprLifetime.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 6ed6b8aaa7040..8f4d5d50669f1 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -288,7 +288,8 @@ static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { // Map and set types. .Cases("find", "equal_range", "lower_bound", "upper_bound", true) .Default(false); - } else if (Callee->getReturnType()->isReferenceType()) { + } + if (Callee->getReturnType()->isReferenceType()) { if (!Callee->getIdentifier()) { auto OO = Callee->getOverloadedOperator(); return OO == OverloadedOperatorKind::OO_Subscript || @@ -316,7 +317,8 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { .Cases("end", "rend", "cend", "crend", true) .Case("data", true) .Default(false); - } else if (FD->getReturnType()->isReferenceType()) { + } + if (FD->getReturnType()->isReferenceType()) { return llvm::StringSwitch(FD->getName()) .Cases("get", "any_cast", true) .Default(false); From 07bef02831836748f46ddd285520f351fe18cfe9 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 5 Sep 2024 12:30:20 +0100 Subject: [PATCH 209/425] [OpenMPOpt] Initialize OpenMPIRBuilderConfig::IsGPU flag (#104456) This patch ensures the `IsGPU` flag is set by the OpenMPOpt pass, so that it can be relied upon by `OpenMPIRBuilder` methods when called by that pass as well. Since currently there are very limited callers for the `OpenMPIRBuilder::isGPU()` method, no assertions are being triggered by the lack of initialization of this flag. However, when more offloading-related features are implemented, it will eventually start happening. 
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 7c1489f37049f..cd94661bbe07f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -286,6 +286,19 @@ struct OMPInformationCache : public InformationCache { OpenMPPostLink(OpenMPPostLink) { OMPBuilder.Config.IsTargetDevice = isOpenMPDevice(OMPBuilder.M); + const Triple T(OMPBuilder.M.getTargetTriple()); + switch (T.getArch()) { + case llvm::Triple::nvptx: + case llvm::Triple::nvptx64: + case llvm::Triple::amdgcn: + assert(OMPBuilder.Config.IsTargetDevice && + "OpenMP AMDGPU/NVPTX is only prepared to deal with device code."); + OMPBuilder.Config.IsGPU = true; + break; + default: + OMPBuilder.Config.IsGPU = false; + break; + } OMPBuilder.initialize(); initializeRuntimeFunctions(M); initializeInternalControlVars(); From 142433684a6e3a2936f814268396dea4190905dc Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Thu, 5 Sep 2024 12:44:10 +0100 Subject: [PATCH 210/425] [OpenMP][Flang] Fix dynamic-extent array mapping (#107247) This patch fixes the mapping and lowering of arrays with dynamic extents and adds a new test for the same. The fix discards the incomplete the dynamic extent information and replacing it with just the base type. When lowering to llvm later, the bounds information is used instead. --- flang/lib/Lower/OpenMP/Utils.cpp | 7 ++++ flang/test/Lower/OpenMP/array-bounds.f90 | 2 +- flang/test/Lower/OpenMP/target.f90 | 2 +- .../offloading/fortran/target-map-dynamic.f90 | 33 +++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 offload/test/offloading/fortran/target-map-dynamic.f90 diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index bbf08961bee66..8073b24a1d5b4 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -132,6 +132,13 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, mlir::TypeAttr varType = mlir::TypeAttr::get( llvm::cast(retTy).getElementType()); + // For types with unknown extents such as <2x?xi32> we discard the incomplete + // type info and only retain the base type. The correct dimensions are later + // recovered through the bounds info. 
+ if (auto seqType = llvm::dyn_cast(varType.getValue())) + if (seqType.hasDynamicExtents()) + varType = mlir::TypeAttr::get(seqType.getEleTy()); + mlir::omp::MapInfoOp op = builder.create( loc, retTy, baseAddr, varType, varPtrPtr, members, membersIndex, bounds, builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90 index 9e59d71f560fc..09498ca6cdde9 100644 --- a/flang/test/Lower/OpenMP/array-bounds.f90 +++ b/flang/test/Lower/OpenMP/array-bounds.f90 @@ -74,7 +74,7 @@ end subroutine assumed_shape_array !HOST: %[[C4_1:.*]] = arith.subi %c4, %c1{{.*}} : index !HOST: %[[EXT:.*]] = arith.addi %[[C4_1]], %c1{{.*}} : index !HOST: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%c1{{.*}} : index) upper_bound(%c4{{.*}} : index) extent(%[[EXT]] : index) stride(%[[DIMS0]]#2 : index) start_idx(%c1{{.*}} : index) {stride_in_bytes = true} -!HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[ARG0_DECL]]#1 : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} +!HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[ARG0_DECL]]#1 : !fir.ref>, i32) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} !HOST: omp.target map_entries(%[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}} : !fir.ref>, !fir.ref) { subroutine assumed_size_array(arr_read_write) integer, intent(inout) :: arr_read_write(*) diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index e86a2f9b6098d..6fccea7e37072 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -460,7 +460,7 @@ subroutine omp_target_implicit_bounds(n) integer :: n integer :: a(n) !CHECK: %[[VAL_14:.*]] = omp.map.bounds lower_bound(%c0{{.*}} : index) upper_bound(%[[UB]] : index) extent(%[[VAL_7]] : index) stride(%c1{{.*}} : index) start_idx(%c1{{.*}} : index) - !CHECK: %[[VAL_15:.*]] = omp.map.info var_ptr(%[[VAL_10]]#1 : !fir.ref>, !fir.array) map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[VAL_14]]) -> !fir.ref> {name = "a"} + !CHECK: %[[VAL_15:.*]] = omp.map.info var_ptr(%[[VAL_10]]#1 : !fir.ref>, i32) map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[VAL_14]]) -> !fir.ref> {name = "a"} !CHECK: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_COPY]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} !CHECK: omp.target map_entries(%[[VAL_15]] -> %[[VAL_17:.*]], %[[VAL_16]] -> %[[VAL_18:.*]] : !fir.ref>, !fir.ref) { !CHECK: ^bb0(%[[VAL_17]]: !fir.ref>, %[[VAL_18]]: !fir.ref): diff --git a/offload/test/offloading/fortran/target-map-dynamic.f90 b/offload/test/offloading/fortran/target-map-dynamic.f90 new file mode 100644 index 0000000000000..8bf6980884a18 --- /dev/null +++ b/offload/test/offloading/fortran/target-map-dynamic.f90 @@ -0,0 +1,33 @@ +! Offloading test checking lowering of arrays with dynamic extents. +! REQUIRES: flang, amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic + +subroutine test_array_target_enter_data(dims) + integer, intent(in) :: dims(2) + double precision :: A(2, dims(2)) + !$omp target enter data map(to: U) + + A(2,2) = 1.0 + !$omp target + A(1,1) = 10 + A(2,1) = 20 + A(1,2) = 30 + A(2,2) = 40 + !$omp end target + + !$omp target exit data map(from: A) + + print *, A +end subroutine test_array_target_enter_data + +program main + integer :: dimensions(2) + dimensions(1) = 1 + dimensions(2) = 2 + +call test_array_target_enter_data(dimensions) +end program + + +! CHECK: 10. 20. 30. 40. From b44d9e5d3605d7ddd64992e3c77b6669f0f7701b Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 5 Sep 2024 12:50:21 +0100 Subject: [PATCH 211/425] VPlanTransforms: fix style after cursory reading (NFC) (#105827) --- .../Transforms/Vectorize/VPlanTransforms.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9796ee64f6ef9..39f7bf55ee5eb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1072,10 +1072,11 @@ void VPlanTransforms::truncateToMinimalBitwidths( ResultVPV->replaceAllUsesWith(Ext); Ext->setOperand(0, ResultVPV); assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?"); - } else + } else { assert( match(&R, m_Binary(m_VPValue(), m_VPValue())) && "Only ICmps should not need extending the result."); + } assert(!isa(&R) && "stores cannot be narrowed"); if (isa(&R)) @@ -1214,7 +1215,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. - auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); LaneMaskPhi->insertAfter(CanonicalIVPHI); // Create the active lane mask for the next iteration of the loop before the @@ -1290,7 +1291,7 @@ void VPlanTransforms::addActiveLaneMask( "DataAndControlFlowWithoutRuntimeCheck implies " "UseActiveLaneMaskForControlFlow"); - auto FoundWidenCanonicalIVUser = + auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), [](VPUser *U) { return isa(U); }); assert(FoundWidenCanonicalIVUser && @@ -1440,14 +1441,13 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( // Collect recipes in the backward slice of `Root` that may generate a poison // value that is used after vectorization. SmallPtrSet Visited; - auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { + auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { SmallVector Worklist; Worklist.push_back(Root); // Traverse the backward slice of Root through its use-def chain. while (!Worklist.empty()) { - VPRecipeBase *CurRec = Worklist.back(); - Worklist.pop_back(); + VPRecipeBase *CurRec = Worklist.pop_back_val(); if (!Visited.insert(CurRec).second) continue; @@ -1493,8 +1493,8 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( } // Add new definitions to the worklist. 
- for (VPValue *operand : CurRec->operands()) - if (VPRecipeBase *OpDef = operand->getDefiningRecipe()) + for (VPValue *Operand : CurRec->operands()) + if (VPRecipeBase *OpDef = Operand->getDefiningRecipe()) Worklist.push_back(OpDef); } }); @@ -1510,7 +1510,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); if (AddrDef && WidenRec->isConsecutive() && BlockNeedsPredication(UnderlyingInstr.getParent())) - collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); + CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef); } else if (auto *InterleaveRec = dyn_cast(&Recipe)) { VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); if (AddrDef) { @@ -1526,7 +1526,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( } if (NeedPredication) - collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); + CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef); } } } From fa385274baae77a0ea7e78c4c6feca6b0ab4f1dc Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 5 Sep 2024 13:53:19 +0200 Subject: [PATCH 212/425] [libc++] Add ABI tests for unordered_{map,set} (#107200) These are used to ensure #76756 is correct. --- .../unord.map/abi.compile.pass.cpp | 134 ++++++++++++++++++ .../unord.set/abi.compile.pass.cpp | 132 +++++++++++++++++ libcxx/utils/libcxx/test/features.py | 1 + 3 files changed, 267 insertions(+) create mode 100644 libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp create mode 100644 libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp diff --git a/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp new file mode 100644 index 0000000000000..c8e5ba0ec899e --- /dev/null +++ b/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp @@ -0,0 +1,134 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: libcpp-has-abi-fix-unordered-container-size-type + +#include +#include + +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_macros.h" + +template +class small_pointer { + std::uint16_t offset; +}; + +template +class small_iter_allocator { +public: + using value_type = T; + using pointer = small_pointer; + using size_type = std::uint16_t; + using difference_type = std::int16_t; + + small_iter_allocator() TEST_NOEXCEPT {} + + template + small_iter_allocator(small_iter_allocator) TEST_NOEXCEPT {} + + T* allocate(std::size_t n); + void deallocate(T* p, std::size_t); + + friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; } + friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; } +}; + +template +class final_small_iter_allocator final { +public: + using value_type = T; + using pointer = small_pointer; + using size_type = std::uint16_t; + using difference_type = std::int16_t; + + final_small_iter_allocator() TEST_NOEXCEPT {} + + template + final_small_iter_allocator(final_small_iter_allocator) TEST_NOEXCEPT {} + + T* allocate(std::size_t n); + void deallocate(T* p, std::size_t); + + friend bool operator==(final_small_iter_allocator, final_small_iter_allocator) { return true; } + friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; } +}; + +template +using unordered_map_alloc = std::unordered_map, std::equal_to, Alloc>; + +#if __SIZE_WIDTH__ == 64 + +static_assert(sizeof(unordered_map_alloc > >) == 40, ""); +static_assert(sizeof(unordered_map_alloc > >) == 40, ""); +static_assert(sizeof(unordered_map_alloc > >) == 64, ""); +static_assert(sizeof(unordered_map_alloc > >) == 12, ""); +static_assert(sizeof(unordered_map_alloc > >) == 16, ""); + +static_assert(sizeof(unordered_map_alloc > >) == 40, ""); +static_assert(sizeof(unordered_map_alloc > >) == 40, ""); +static_assert(sizeof(unordered_map_alloc > >) == 64, ""); +static_assert(sizeof(unordered_map_alloc > >) == 12, ""); +static_assert(sizeof(unordered_map_alloc > >) == 16, ""); + +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); + +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, + ""); + +struct TEST_ALIGNAS(32) AlignedHash {}; +struct UnalignedEqualTo {}; + +static_assert(sizeof(std::unordered_map) == 96, ""); +static_assert(TEST_ALIGNOF(std::unordered_map) == 32, ""); + +#elif __SIZE_WIDTH__ == 32 + +static_assert(sizeof(unordered_map_alloc > >) == 20, ""); +static_assert(sizeof(unordered_map_alloc > >) == 20, ""); +static_assert(sizeof(unordered_map_alloc > >) == 44, ""); +static_assert(sizeof(unordered_map_alloc > >) == 12, ""); +static_assert(sizeof(unordered_map_alloc > >) == 16, ""); + +static_assert(sizeof(unordered_map_alloc > >) == 20, ""); +static_assert(sizeof(unordered_map_alloc > >) == 20, ""); 
+static_assert(sizeof(unordered_map_alloc > >) == 44, ""); +static_assert(sizeof(unordered_map_alloc > >) == 12, ""); +static_assert(sizeof(unordered_map_alloc > >) == 16, ""); + +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); + +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, + ""); + +struct TEST_ALIGNAS(32) AlignedHash {}; +struct UnalignedEqualTo {}; + +static_assert(sizeof(std::unordered_map) == 96); +static_assert(TEST_ALIGNOF(std::unordered_map) == 32); + +#else +# error std::size_t has an unexpected size +#endif diff --git a/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp new file mode 100644 index 0000000000000..359e248ff7a4f --- /dev/null +++ b/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp @@ -0,0 +1,132 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: libcpp-has-abi-fix-unordered-container-size-type + +#include +#include + +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_macros.h" + +template +class small_pointer { + std::uint16_t offset; +}; + +template +class small_iter_allocator { +public: + using value_type = T; + using pointer = small_pointer; + using size_type = std::uint16_t; + using difference_type = std::int16_t; + + small_iter_allocator() TEST_NOEXCEPT {} + + template + small_iter_allocator(small_iter_allocator) TEST_NOEXCEPT {} + + T* allocate(std::size_t n); + void deallocate(T* p, std::size_t); + + friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; } + friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; } +}; + +template +class final_small_iter_allocator final { +public: + using value_type = T; + using pointer = small_pointer; + using size_type = std::uint16_t; + using difference_type = std::int16_t; + + final_small_iter_allocator() TEST_NOEXCEPT {} + + template + final_small_iter_allocator(final_small_iter_allocator) TEST_NOEXCEPT {} + + T* allocate(std::size_t n); + void deallocate(T* p, std::size_t); + + friend bool operator==(final_small_iter_allocator, final_small_iter_allocator) { return true; } + friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; } +}; + +template +using unordered_set_alloc = std::unordered_set, std::equal_to, Alloc>; + +#if __SIZE_WIDTH__ == 64 + +static_assert(sizeof(unordered_set_alloc >) == 40, ""); +static_assert(sizeof(unordered_set_alloc >) == 40, ""); +static_assert(sizeof(unordered_set_alloc >) == 64, ""); +static_assert(sizeof(unordered_set_alloc >) == 12, ""); +static_assert(sizeof(unordered_set_alloc >) == 
16, ""); + +static_assert(sizeof(unordered_set_alloc >) == 40, ""); +static_assert(sizeof(unordered_set_alloc >) == 40, ""); +static_assert(sizeof(unordered_set_alloc >) == 64, ""); +static_assert(sizeof(unordered_set_alloc >) == 12, ""); +static_assert(sizeof(unordered_set_alloc >) == 16, ""); + +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); + +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); + +struct TEST_ALIGNAS(32) AlignedHash {}; +struct UnalignedEqualTo {}; + +static_assert(sizeof(std::unordered_set) == 96, ""); +static_assert(TEST_ALIGNOF(std::unordered_set) == 32, ""); + +#elif __SIZE_WIDTH__ == 32 + +static_assert(sizeof(unordered_set_alloc >) == 20, ""); +static_assert(sizeof(unordered_set_alloc >) == 20, ""); +static_assert(sizeof(unordered_set_alloc >) == 44, ""); +static_assert(sizeof(unordered_set_alloc >) == 12, ""); +static_assert(sizeof(unordered_set_alloc >) == 16, ""); + +static_assert(sizeof(unordered_set_alloc >) == 20, ""); +static_assert(sizeof(unordered_set_alloc >) == 20, ""); +static_assert(sizeof(unordered_set_alloc >) == 44, ""); +static_assert(sizeof(unordered_set_alloc >) == 12, ""); +static_assert(sizeof(unordered_set_alloc >) == 16, ""); + +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); + +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); + +struct TEST_ALIGNAS(32) AlignedHash {}; +struct UnalignedEqualTo {}; + +static_assert(sizeof(std::unordered_set) == 96); +static_assert(TEST_ALIGNOF(std::unordered_set) == 32); + +#else +# error std::size_t has an unexpected size +#endif diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 6857a28eb3299..2cd04124a6ca5 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -374,6 +374,7 @@ def _mingwSupportsModules(cfg): "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators", "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING": "libcpp-has-abi-bounded-iterators-in-string", "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR": "libcpp-has-abi-bounded-iterators-in-vector", + "_LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE": "libcpp-has-abi-fix-unordered-container-size-type", "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor", "_LIBCPP_HAS_NO_FILESYSTEM": "no-filesystem", "_LIBCPP_HAS_NO_RANDOM_DEVICE": "no-random-device", From 11040560ba30381ed47c3089a2562a41b00dbb4b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 5 Sep 2024 12:52:00 +0100 Subject: [PATCH 
213/425] [X86] preferABDSToABSWithNSW - use ABDS for i32/i64 if we have CMOV Now that we have better ABDS lowering, prefer cmov(sub(x,y),sub(y,x)) to cmov(abs(sub(x,y)),sub(x,y)) to improve ILP --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- llvm/test/CodeGen/X86/abds-neg.ll | 36 ++++++++++++--------- llvm/test/CodeGen/X86/abds.ll | 42 +++++++++++++------------ 3 files changed, 45 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 451881e1d6141..092a7192929fd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58210,7 +58210,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, } bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const { - return false; + return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64); } // Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS. diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index 833273dc98243..f837f49abf7a4 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -1027,19 +1027,22 @@ define i16 @abd_subnsw_i16_undef(i16 %a, i16 %b) nounwind { define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_subnsw_i32: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: negl %eax -; X86-NEXT: cmovnsl %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32: ; X64: # %bb.0: -; X64-NEXT: subl %esi, %edi ; X64-NEXT: movl %edi, %eax +; X64-NEXT: subl %esi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: negl %eax -; X64-NEXT: cmovnsl %edi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 false) @@ -1050,19 +1053,22 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_subnsw_i32_undef: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: negl %eax -; X86-NEXT: cmovnsl %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32_undef: ; X64: # %bb.0: -; X64-NEXT: subl %esi, %edi ; X64-NEXT: movl %edi, %eax +; X64-NEXT: subl %esi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: negl %eax -; X64-NEXT: cmovnsl %edi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 true) @@ -1090,10 +1096,11 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; ; X64-LABEL: abd_subnsw_i64: ; X64: # %bb.0: -; X64-NEXT: subq %rsi, %rdi ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: subq %rsi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: negq %rax -; X64-NEXT: cmovnsq %rdi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 false) @@ -1121,10 +1128,11 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; ; X64-LABEL: abd_subnsw_i64_undef: ; X64: # %bb.0: -; X64-NEXT: subq %rsi, %rdi ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: subq %rsi, %rax +; 
X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: negq %rax -; X64-NEXT: cmovnsq %rdi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 true) diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index d9ba140032b31..9476fd14306fe 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -928,19 +928,20 @@ define i16 @abd_subnsw_i16_undef(i16 %a, i16 %b) nounwind { define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_subnsw_i32: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: negl %eax -; X86-NEXT: cmovsl %ecx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32: ; X64: # %bb.0: -; X64-NEXT: subl %esi, %edi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %edi, %eax +; X64-NEXT: subl %esi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 false) @@ -950,19 +951,20 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_subnsw_i32_undef: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: negl %eax -; X86-NEXT: cmovsl %ecx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32_undef: ; X64: # %bb.0: -; X64-NEXT: subl %esi, %edi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %edi, %eax +; X64-NEXT: subl %esi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 true) @@ -986,10 +988,10 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; ; X64-LABEL: abd_subnsw_i64: ; X64: # %bb.0: -; X64-NEXT: subq %rsi, %rdi ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: negq %rax -; X64-NEXT: cmovsq %rdi, %rax +; X64-NEXT: subq %rsi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 false) @@ -1013,10 +1015,10 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; ; X64-LABEL: abd_subnsw_i64_undef: ; X64: # %bb.0: -; X64-NEXT: subq %rsi, %rdi ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: negq %rax -; X64-NEXT: cmovsq %rdi, %rax +; X64-NEXT: subq %rsi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 true) From 84cf3a573e89b18ce79ff32a7646c0a99729029c Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 5 Sep 2024 13:23:08 +0100 Subject: [PATCH 214/425] [Clang] CWG2749: relational operators involving pointers to void (#93046) https://cplusplus.github.io/CWG/issues/2749.html This DR's effects are backported to C++98. Does not affect C where integral constant expressions cannot involve pointers. 
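As a self-contained illustration of what this permits (mirroring the static_asserts added to the tests below; the array name is arbitrary):

  // With CWG2749 applied, comparing unequal pointers to void is a valid
  // constant expression (C++11 or later for this static_assert form).
  extern int x[2];
  static_assert(static_cast<const void *>(&x[0]) <
                    static_cast<const void *>(&x[1]),
                "array element 0 is ordered before element 1");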
--------- Co-authored-by: Vlad Serebrennikov Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 4 ++ .../include/clang/Basic/DiagnosticASTKinds.td | 2 - clang/lib/AST/ExprConstant.cpp | 10 ---- clang/test/AST/ByteCode/literals.cpp | 8 +--- clang/test/CXX/drs/cwg27xx.cpp | 46 ++++++++++++++++++- clang/test/CXX/expr/expr.const/p2-0x.cpp | 11 +++-- clang/www/cxx_dr_status.html | 2 +- 7 files changed, 58 insertions(+), 25 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index bd84a2e40fb8b..dc103aceebc36 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -154,6 +154,10 @@ Resolutions to C++ Defect Reports - Allow ``void{}`` as a prvalue of type ``void``. (`CWG2351: void{} `_). +- Clang now allows comparing unequal object pointers that have been cast to ``void *`` + in constant expressions. These comparisons always worked in non-constant expressions. + (`CWG2749: Treatment of "pointer to void" for relational comparisons `_). + C Language Changes ------------------ diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index 45ad84831589b..91135b18c7571 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -148,8 +148,6 @@ def note_constexpr_var_init_weak : Note< def note_constexpr_typeid_polymorphic : Note< "typeid applied to expression of polymorphic type %0 is " "not allowed in a constant expression in C++ standards before C++20">; -def note_constexpr_void_comparison : Note< - "comparison between unequal pointers to void has unspecified result">; def note_constexpr_temporary_here : Note<"temporary created here">; def note_constexpr_dynamic_alloc_here : Note<"heap allocation performed here">; def note_constexpr_conditional_never_const : Note< diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 3dc13c14c0034..205cbdf52a6f7 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13895,16 +13895,6 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E, SubobjectDesignator &LHSDesignator = LHSValue.getLValueDesignator(); SubobjectDesignator &RHSDesignator = RHSValue.getLValueDesignator(); - // C++11 [expr.rel]p3: - // Pointers to void (after pointer conversions) can be compared, with a - // result defined as follows: If both pointers represent the same - // address or are both the null pointer value, the result is true if the - // operator is <= or >= and false otherwise; otherwise the result is - // unspecified. - // We interpret this as applying to pointers to *cv* void. - if (LHSTy->isVoidPointerType() && LHSOffset != RHSOffset && IsRelational) - Info.CCEDiag(E, diag::note_constexpr_void_comparison); - // C++11 [expr.rel]p2: // - If two pointers point to non-static data members of the same object, // or to subobjects or array elements fo such members, recursively, the diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp index 2329d4d973f01..13d6c4feb3500 100644 --- a/clang/test/AST/ByteCode/literals.cpp +++ b/clang/test/AST/ByteCode/literals.cpp @@ -199,12 +199,8 @@ namespace PointerComparison { constexpr bool v3 = null == pv; // ok constexpr bool v4 = qv == pv; // ok - /// FIXME: These two are rejected by the current interpreter, but - /// accepted by GCC. 
- constexpr bool v5 = qv >= pv; // ref-error {{constant expression}} \ - // ref-note {{unequal pointers to void}} - constexpr bool v8 = qv > (void*)&s.a; // ref-error {{constant expression}} \ - // ref-note {{unequal pointers to void}} + constexpr bool v5 = qv >= pv; + constexpr bool v8 = qv > (void*)&s.a; constexpr bool v6 = qv > null; // both-error {{must be initialized by a constant expression}} \ // both-note {{comparison between '&s.b' and 'nullptr' has unspecified value}} diff --git a/clang/test/CXX/drs/cwg27xx.cpp b/clang/test/CXX/drs/cwg27xx.cpp index 406c8ea41f3b2..b3867696c615b 100644 --- a/clang/test/CXX/drs/cwg27xx.cpp +++ b/clang/test/CXX/drs/cwg27xx.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++98 -pedantic-errors -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++98 -pedantic-errors -verify=expected,cxx98 %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++11 -pedantic-errors -verify=expected %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++14 -pedantic-errors -verify=expected %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++17 -pedantic-errors -verify=expected %s @@ -6,6 +6,29 @@ // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -pedantic-errors -verify=expected,since-cxx23 %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++2c -pedantic-errors -verify=expected,since-cxx23,since-cxx26 %s +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + +#if __cplusplus == 199711L +#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x)) +#else +#define __enable_constant_folding +#endif + +namespace std { +#if __cplusplus >= 202002L + struct strong_ordering { + int n; + constexpr operator int() const { return n; } + static const strong_ordering less, equal, greater; + }; + constexpr strong_ordering strong_ordering::less{-1}, + strong_ordering::equal{0}, strong_ordering::greater{1}; +#endif +} // namespace std + namespace cwg2718 { // cwg2718: 2.7 struct B {}; struct D; @@ -18,6 +41,27 @@ void f(B b) { struct D : B {}; } // namespace cwg2718 +namespace cwg2749 { // cwg2749: 20 + +extern int x[2]; +struct Y { + int i; + int j; +}; +extern Y y[2]; + +static_assert(__enable_constant_folding(static_cast(&x[0]) < static_cast(&x[1])), ""); +static_assert(__enable_constant_folding(static_cast(&y[0].i) < static_cast(&y[0].j)), ""); +static_assert(__enable_constant_folding(static_cast(&y[0].j) < static_cast(&y[1].i)), ""); + +#if __cplusplus >= 202002L +static_assert((static_cast(&x[0]) <=> static_cast(&x[1])) == std::strong_ordering::less); +static_assert((static_cast(&y[0].i) <=> static_cast(&y[0].j)) == std::strong_ordering::less); +static_assert((static_cast(&y[0].j) <=> static_cast(&y[1].i)) == std::strong_ordering::less); +#endif + +} // namespace cwg2749 + namespace cwg2759 { // cwg2759: 19 #if __cplusplus >= 201103L diff --git a/clang/test/CXX/expr/expr.const/p2-0x.cpp b/clang/test/CXX/expr/expr.const/p2-0x.cpp index e3cd057baba75..767eee1c74f05 100644 --- a/clang/test/CXX/expr/expr.const/p2-0x.cpp +++ b/clang/test/CXX/expr/expr.const/p2-0x.cpp @@ -571,18 +571,19 @@ namespace UnspecifiedRelations { // [expr.rel]p3: Pointers to void can be compared [...] if both pointers // represent the same address or are both the null pointer [...]; otherwise // the result is unspecified. 
+ // Same address restriction removed by CWG2749 struct S { int a, b; } s; constexpr void *null = 0; constexpr void *pv = (void*)&s.a; constexpr void *qv = (void*)&s.b; constexpr bool v1 = null < (int*)0; constexpr bool v2 = null < pv; // expected-error {{constant expression}} expected-note {{comparison between 'nullptr' and '&s.a' has unspecified value}} - constexpr bool v3 = null == pv; // ok - constexpr bool v4 = qv == pv; // ok - constexpr bool v5 = qv >= pv; // expected-error {{constant expression}} expected-note {{unequal pointers to void}} + constexpr bool v3 = null == pv; + constexpr bool v4 = qv == pv; + constexpr bool v5 = qv >= pv; constexpr bool v6 = qv > null; // expected-error {{constant expression}} expected-note {{comparison between '&s.b' and 'nullptr' has unspecified value}} - constexpr bool v7 = qv <= (void*)&s.b; // ok - constexpr bool v8 = qv > (void*)&s.a; // expected-error {{constant expression}} expected-note {{unequal pointers to void}} + constexpr bool v7 = qv <= (void*)&s.b; + constexpr bool v8 = qv > (void*)&s.a; } // - an assignment or a compound assignment (5.17); or diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index aa79c3706f32b..b638f0ff30bcc 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -16341,7 +16341,7 @@

C++ defect report implementation status

2749 DRWP Treatment of "pointer to void" for relational comparisons - Unknown + Clang 20 2750 From d6832a611a7c4ec36f08b1cfe9af850dad32da2e Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 5 Sep 2024 08:28:33 -0400 Subject: [PATCH 215/425] [libc++][modules] Modularize (#107254) Many headers include `` just for size_t, and pulling in additional content (e.g. the traits used for std::byte) is unnecessary. To solve this problem, this patch splits up `` into subcomponents so that headers can include only the parts that they actually require. This has the added benefit of making the modules build a lot stricter with respect to IWYU, and also providing a canonical location where we define `std::size_t` and friends (which were previously defined in multiple headers like `` and ``). After this patch, there's still many places in the codebase where we include `` when `<__cstddef/size_t.h>` would be sufficient. This patch focuses on removing `` includes from __type_traits to make these headers non-circular with ``. Additional refactorings can be tackled separately. --- libcxx/include/CMakeLists.txt | 5 ++ libcxx/include/__algorithm/ranges_minmax.h | 1 + libcxx/include/__atomic/atomic.h | 2 + libcxx/include/__charconv/to_chars_integral.h | 1 + libcxx/include/__cstddef/byte.h | 84 +++++++++++++++++++ libcxx/include/__cstddef/max_align_t.h | 27 ++++++ libcxx/include/__cstddef/nullptr_t.h | 25 ++++++ libcxx/include/__cstddef/ptrdiff_t.h | 25 ++++++ libcxx/include/__cstddef/size_t.h | 25 ++++++ libcxx/include/__exception/nested_exception.h | 2 + libcxx/include/__fwd/array.h | 3 +- libcxx/include/__fwd/complex.h | 2 +- libcxx/include/__fwd/pair.h | 2 +- libcxx/include/__fwd/span.h | 2 +- libcxx/include/__fwd/subrange.h | 2 +- libcxx/include/__fwd/tuple.h | 2 +- libcxx/include/__iterator/concepts.h | 1 + libcxx/include/__iterator/iterator_traits.h | 2 + libcxx/include/__iterator/wrap_iter.h | 1 + libcxx/include/__mdspan/layout_stride.h | 1 + libcxx/include/__memory/pointer_traits.h | 1 + libcxx/include/__memory/shared_ptr.h | 2 + libcxx/include/__memory/unique_ptr.h | 1 + libcxx/include/__memory/uses_allocator.h | 1 + .../__random/mersenne_twister_engine.h | 1 + libcxx/include/__random/seed_seq.h | 2 + .../__random/subtract_with_carry_engine.h | 1 + libcxx/include/__ranges/subrange.h | 1 + .../include/__string/constexpr_c_functions.h | 2 + libcxx/include/__tuple/tuple_size.h | 2 + .../include/__type_traits/aligned_storage.h | 2 +- libcxx/include/__type_traits/aligned_union.h | 2 +- libcxx/include/__type_traits/alignment_of.h | 2 +- libcxx/include/__type_traits/extent.h | 2 +- libcxx/include/__type_traits/is_allocator.h | 2 +- libcxx/include/__type_traits/is_array.h | 2 +- .../include/__type_traits/is_bounded_array.h | 2 +- .../__type_traits/is_nothrow_destructible.h | 2 +- .../include/__type_traits/is_null_pointer.h | 2 +- libcxx/include/__type_traits/is_swappable.h | 2 +- libcxx/include/__type_traits/rank.h | 2 +- .../__type_traits/remove_all_extents.h | 2 +- libcxx/include/__type_traits/remove_extent.h | 2 +- libcxx/include/__type_traits/type_list.h | 2 +- libcxx/include/__utility/in_place.h | 1 + libcxx/include/__utility/pair.h | 1 + libcxx/include/__utility/swap.h | 1 + libcxx/include/any | 1 + libcxx/include/cstddef | 83 ++---------------- libcxx/include/cstdio | 2 +- libcxx/include/cstdlib | 2 +- libcxx/include/cstring | 2 +- libcxx/include/ctime | 2 +- libcxx/include/cuchar | 2 +- libcxx/include/cwchar | 2 +- .../include/experimental/__simd/reference.h | 1 + 
libcxx/include/experimental/__simd/scalar.h | 1 + libcxx/include/experimental/__simd/simd.h | 1 + .../include/experimental/__simd/simd_mask.h | 1 + libcxx/include/experimental/__simd/vec_ext.h | 1 + libcxx/include/module.modulemap | 31 +++++-- libcxx/include/typeinfo | 1 + libcxx/include/unordered_map | 2 + libcxx/include/unordered_set | 2 + .../test/libcxx/transitive_includes/cxx03.csv | 2 - .../test/libcxx/transitive_includes/cxx11.csv | 2 - .../test/libcxx/transitive_includes/cxx14.csv | 2 - .../test/libcxx/transitive_includes/cxx17.csv | 2 - .../test/libcxx/transitive_includes/cxx20.csv | 2 - .../test/libcxx/transitive_includes/cxx23.csv | 2 - .../test/libcxx/transitive_includes/cxx26.csv | 2 - .../bit/bit.pow.two/bit_ceil.pass.cpp | 1 + .../bit/bit.pow.two/bit_floor.pass.cpp | 1 + .../bit/bit.pow.two/bit_width.pass.cpp | 1 + .../bit/bit.pow.two/has_single_bit.pass.cpp | 1 + .../bit/bitops.count/countl_one.pass.cpp | 1 + .../bit/bitops.count/countl_zero.pass.cpp | 1 + .../bit/bitops.count/countr_one.pass.cpp | 1 + .../bit/bitops.count/countr_zero.pass.cpp | 1 + .../bit/bitops.count/popcount.pass.cpp | 1 + .../std/numerics/bit/bitops.rot/rotl.pass.cpp | 1 + .../std/numerics/bit/bitops.rot/rotr.pass.cpp | 1 + .../header_exportable_declarations.cpp | 4 +- libcxx/utils/libcxx/test/modules.py | 4 + 84 files changed, 306 insertions(+), 124 deletions(-) create mode 100644 libcxx/include/__cstddef/byte.h create mode 100644 libcxx/include/__cstddef/max_align_t.h create mode 100644 libcxx/include/__cstddef/nullptr_t.h create mode 100644 libcxx/include/__cstddef/ptrdiff_t.h create mode 100644 libcxx/include/__cstddef/size_t.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 210beaf5a3364..0f43916dae438 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -324,6 +324,11 @@ set(files __coroutine/coroutine_traits.h __coroutine/noop_coroutine_handle.h __coroutine/trivial_awaitables.h + __cstddef/byte.h + __cstddef/max_align_t.h + __cstddef/nullptr_t.h + __cstddef/ptrdiff_t.h + __cstddef/size_t.h __debug_utils/randomize_range.h __debug_utils/sanitizers.h __debug_utils/strict_weak_ordering_check.h diff --git a/libcxx/include/__algorithm/ranges_minmax.h b/libcxx/include/__algorithm/ranges_minmax.h index 1b43b1e19cdec..5f2e5cb2a1eea 100644 --- a/libcxx/include/__algorithm/ranges_minmax.h +++ b/libcxx/include/__algorithm/ranges_minmax.h @@ -24,6 +24,7 @@ #include <__ranges/access.h> #include <__ranges/concepts.h> #include <__type_traits/desugars_to.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_reference.h> #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/remove_cvref.h> diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h index bcea21f5ce2e1..af6d12b5e4ce9 100644 --- a/libcxx/include/__atomic/atomic.h +++ b/libcxx/include/__atomic/atomic.h @@ -16,8 +16,10 @@ #include <__config> #include <__functional/operations.h> #include <__memory/addressof.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_floating_point.h> #include <__type_traits/is_function.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_same.h> #include <__type_traits/remove_const.h> #include <__type_traits/remove_pointer.h> diff --git a/libcxx/include/__charconv/to_chars_integral.h b/libcxx/include/__charconv/to_chars_integral.h index 0369f4dfb9bda..ccb5856df1799 100644 --- a/libcxx/include/__charconv/to_chars_integral.h +++ b/libcxx/include/__charconv/to_chars_integral.h @@ 
-21,6 +21,7 @@ #include <__system_error/errc.h> #include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_same.h> #include <__type_traits/make_32_64_or_128_bit.h> #include <__type_traits/make_unsigned.h> diff --git a/libcxx/include/__cstddef/byte.h b/libcxx/include/__cstddef/byte.h new file mode 100644 index 0000000000000..b8cfe5e8d1c7e --- /dev/null +++ b/libcxx/include/__cstddef/byte.h @@ -0,0 +1,84 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___CSTDDEF_BYTE_H +#define _LIBCPP___CSTDDEF_BYTE_H + +#include <__config> +#include <__type_traits/enable_if.h> +#include <__type_traits/is_integral.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if _LIBCPP_STD_VER >= 17 +namespace std { // purposefully not versioned + +enum class byte : unsigned char {}; + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator|(byte __lhs, byte __rhs) noexcept { + return static_cast( + static_cast(static_cast(__lhs) | static_cast(__rhs))); +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator|=(byte& __lhs, byte __rhs) noexcept { + return __lhs = __lhs | __rhs; +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator&(byte __lhs, byte __rhs) noexcept { + return static_cast( + static_cast(static_cast(__lhs) & static_cast(__rhs))); +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator&=(byte& __lhs, byte __rhs) noexcept { + return __lhs = __lhs & __rhs; +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator^(byte __lhs, byte __rhs) noexcept { + return static_cast( + static_cast(static_cast(__lhs) ^ static_cast(__rhs))); +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator^=(byte& __lhs, byte __rhs) noexcept { + return __lhs = __lhs ^ __rhs; +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator~(byte __b) noexcept { + return static_cast(static_cast(~static_cast(__b))); +} + +template ::value, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr byte& operator<<=(byte& __lhs, _Integer __shift) noexcept { + return __lhs = __lhs << __shift; +} + +template ::value, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr byte operator<<(byte __lhs, _Integer __shift) noexcept { + return static_cast(static_cast(static_cast(__lhs) << __shift)); +} + +template ::value, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr byte& operator>>=(byte& __lhs, _Integer __shift) noexcept { + return __lhs = __lhs >> __shift; +} + +template ::value, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr byte operator>>(byte __lhs, _Integer __shift) noexcept { + return static_cast(static_cast(static_cast(__lhs) >> __shift)); +} + +template ::value, int> = 0> +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Integer to_integer(byte __b) noexcept { + return static_cast<_Integer>(__b); +} + +} // namespace std +#endif // _LIBCPP_STD_VER >= 17 + +#endif // _LIBCPP___CSTDDEF_BYTE_H diff --git a/libcxx/include/__cstddef/max_align_t.h b/libcxx/include/__cstddef/max_align_t.h new file mode 100644 index 0000000000000..7c09c7e7f3017 --- /dev/null +++ b/libcxx/include/__cstddef/max_align_t.h @@ -0,0 +1,27 @@ 
+//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___CSTDDEF_MAX_ALIGN_T_H +#define _LIBCPP___CSTDDEF_MAX_ALIGN_T_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if !defined(_LIBCPP_CXX03_LANG) +using ::max_align_t _LIBCPP_USING_IF_EXISTS; +#endif + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___CSTDDEF_MAX_ALIGN_T_H diff --git a/libcxx/include/__cstddef/nullptr_t.h b/libcxx/include/__cstddef/nullptr_t.h new file mode 100644 index 0000000000000..de3f7d4ab5fa7 --- /dev/null +++ b/libcxx/include/__cstddef/nullptr_t.h @@ -0,0 +1,25 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___CSTDDEF_NULLPTR_T_H +#define _LIBCPP___CSTDDEF_NULLPTR_T_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +using ::nullptr_t; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___CSTDDEF_NULLPTR_T_H diff --git a/libcxx/include/__cstddef/ptrdiff_t.h b/libcxx/include/__cstddef/ptrdiff_t.h new file mode 100644 index 0000000000000..f8b5cdaaff01c --- /dev/null +++ b/libcxx/include/__cstddef/ptrdiff_t.h @@ -0,0 +1,25 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___CSTDDEF_PTRDIFF_T_H +#define _LIBCPP___CSTDDEF_PTRDIFF_T_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +using ::ptrdiff_t _LIBCPP_USING_IF_EXISTS; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___CSTDDEF_PTRDIFF_T_H diff --git a/libcxx/include/__cstddef/size_t.h b/libcxx/include/__cstddef/size_t.h new file mode 100644 index 0000000000000..91abbf0131895 --- /dev/null +++ b/libcxx/include/__cstddef/size_t.h @@ -0,0 +1,25 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___CSTDDEF_SIZE_T_H +#define _LIBCPP___CSTDDEF_SIZE_T_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +using ::size_t _LIBCPP_USING_IF_EXISTS; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___CSTDDEF_SIZE_T_H diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h index feb489f87f62f..4c7970d167ffa 100644 --- a/libcxx/include/__exception/nested_exception.h +++ b/libcxx/include/__exception/nested_exception.h @@ -13,6 +13,8 @@ #include <__exception/exception_ptr.h> #include <__memory/addressof.h> #include <__type_traits/decay.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_base_of.h> #include <__type_traits/is_class.h> #include <__type_traits/is_constructible.h> diff --git a/libcxx/include/__fwd/array.h b/libcxx/include/__fwd/array.h index 6c6461e727604..794779ae46ab7 100644 --- a/libcxx/include/__fwd/array.h +++ b/libcxx/include/__fwd/array.h @@ -10,7 +10,8 @@ #define _LIBCPP___FWD_ARRAY_H #include <__config> -#include +#include <__cstddef/size_t.h> +#include <__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__fwd/complex.h b/libcxx/include/__fwd/complex.h index 22c78c5cc3c77..092d2e10b12b5 100644 --- a/libcxx/include/__fwd/complex.h +++ b/libcxx/include/__fwd/complex.h @@ -10,7 +10,7 @@ #define _LIBCPP___FWD_COMPLEX_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__fwd/pair.h b/libcxx/include/__fwd/pair.h index af32628fe1e0d..b8ba2b7e92324 100644 --- a/libcxx/include/__fwd/pair.h +++ b/libcxx/include/__fwd/pair.h @@ -10,8 +10,8 @@ #define _LIBCPP___FWD_PAIR_H #include <__config> +#include <__cstddef/size_t.h> #include <__fwd/tuple.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__fwd/span.h b/libcxx/include/__fwd/span.h index 8dafa742c19df..5d473ee51c6b7 100644 --- a/libcxx/include/__fwd/span.h +++ b/libcxx/include/__fwd/span.h @@ -11,7 +11,7 @@ #define _LIBCPP___FWD_SPAN_H #include <__config> -#include +#include <__cstddef/size_t.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__fwd/subrange.h b/libcxx/include/__fwd/subrange.h index 60a41da23dd44..5b3a07e55348a 100644 --- a/libcxx/include/__fwd/subrange.h +++ b/libcxx/include/__fwd/subrange.h @@ -11,8 +11,8 @@ #include <__concepts/copyable.h> #include <__config> +#include <__cstddef/size_t.h> #include <__iterator/concepts.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__fwd/tuple.h b/libcxx/include/__fwd/tuple.h index 902770c29555e..2ed32bc0df4e1 100644 --- a/libcxx/include/__fwd/tuple.h +++ b/libcxx/include/__fwd/tuple.h @@ -10,7 +10,7 @@ #define _LIBCPP___FWD_TUPLE_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/concepts.h b/libcxx/include/__iterator/concepts.h index be2890bee4928..93ea7efd49537 100644 --- 
a/libcxx/include/__iterator/concepts.h +++ b/libcxx/include/__iterator/concepts.h @@ -34,6 +34,7 @@ #include <__memory/pointer_traits.h> #include <__type_traits/add_pointer.h> #include <__type_traits/common_reference.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_pointer.h> #include <__type_traits/is_primary_template.h> #include <__type_traits/is_reference.h> diff --git a/libcxx/include/__iterator/iterator_traits.h b/libcxx/include/__iterator/iterator_traits.h index 11af9e301842c..4d9ad480cc4a2 100644 --- a/libcxx/include/__iterator/iterator_traits.h +++ b/libcxx/include/__iterator/iterator_traits.h @@ -24,6 +24,8 @@ #include <__type_traits/common_reference.h> #include <__type_traits/conditional.h> #include <__type_traits/disjunction.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_object.h> #include <__type_traits/is_primary_template.h> diff --git a/libcxx/include/__iterator/wrap_iter.h b/libcxx/include/__iterator/wrap_iter.h index 34f8d5f1663b2..549d8ff2dbd7d 100644 --- a/libcxx/include/__iterator/wrap_iter.h +++ b/libcxx/include/__iterator/wrap_iter.h @@ -17,6 +17,7 @@ #include <__memory/addressof.h> #include <__memory/pointer_traits.h> #include <__type_traits/enable_if.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> #include diff --git a/libcxx/include/__mdspan/layout_stride.h b/libcxx/include/__mdspan/layout_stride.h index 704a5a4c1aea5..c57f596431c7c 100644 --- a/libcxx/include/__mdspan/layout_stride.h +++ b/libcxx/include/__mdspan/layout_stride.h @@ -25,6 +25,7 @@ #include <__type_traits/common_type.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_convertible.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_same.h> #include <__utility/as_const.h> diff --git a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h index 0914aceb318b7..8e08eb74413ee 100644 --- a/libcxx/include/__memory/pointer_traits.h +++ b/libcxx/include/__memory/pointer_traits.h @@ -15,6 +15,7 @@ #include <__type_traits/conditional.h> #include <__type_traits/conjunction.h> #include <__type_traits/decay.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_class.h> #include <__type_traits/is_function.h> #include <__type_traits/is_void.h> diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index d487e4fbe3a95..4dd8022822d22 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -34,6 +34,8 @@ #include <__type_traits/conditional.h> #include <__type_traits/conjunction.h> #include <__type_traits/disjunction.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_array.h> #include <__type_traits/is_bounded_array.h> #include <__type_traits/is_constructible.h> diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h index 7f5e0ea243c95..392cf42137821 100644 --- a/libcxx/include/__memory/unique_ptr.h +++ b/libcxx/include/__memory/unique_ptr.h @@ -23,6 +23,7 @@ #include <__type_traits/common_type.h> #include <__type_traits/conditional.h> #include <__type_traits/dependent_type.h> +#include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_array.h> #include <__type_traits/is_assignable.h> diff --git 
a/libcxx/include/__memory/uses_allocator.h b/libcxx/include/__memory/uses_allocator.h index 84310c3fa5673..16504e8b2a998 100644 --- a/libcxx/include/__memory/uses_allocator.h +++ b/libcxx/include/__memory/uses_allocator.h @@ -11,6 +11,7 @@ #define _LIBCPP___MEMORY_USES_ALLOCATOR_H #include <__config> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> #include diff --git a/libcxx/include/__random/mersenne_twister_engine.h b/libcxx/include/__random/mersenne_twister_engine.h index 65280d7c5505f..1f50e608ce8d4 100644 --- a/libcxx/include/__random/mersenne_twister_engine.h +++ b/libcxx/include/__random/mersenne_twister_engine.h @@ -13,6 +13,7 @@ #include <__algorithm/min.h> #include <__config> #include <__random/is_seed_sequence.h> +#include <__type_traits/enable_if.h> #include #include #include diff --git a/libcxx/include/__random/seed_seq.h b/libcxx/include/__random/seed_seq.h index 5cf84aeb8a72b..c1a320a75c88a 100644 --- a/libcxx/include/__random/seed_seq.h +++ b/libcxx/include/__random/seed_seq.h @@ -14,6 +14,8 @@ #include <__algorithm/max.h> #include <__config> #include <__iterator/iterator_traits.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_unsigned.h> #include #include diff --git a/libcxx/include/__random/subtract_with_carry_engine.h b/libcxx/include/__random/subtract_with_carry_engine.h index ec25fed49f949..926333cdda45e 100644 --- a/libcxx/include/__random/subtract_with_carry_engine.h +++ b/libcxx/include/__random/subtract_with_carry_engine.h @@ -14,6 +14,7 @@ #include <__config> #include <__random/is_seed_sequence.h> #include <__random/linear_congruential_engine.h> +#include <__type_traits/enable_if.h> #include #include #include diff --git a/libcxx/include/__ranges/subrange.h b/libcxx/include/__ranges/subrange.h index aba584ef93354..144746babb325 100644 --- a/libcxx/include/__ranges/subrange.h +++ b/libcxx/include/__ranges/subrange.h @@ -33,6 +33,7 @@ #include <__tuple/tuple_size.h> #include <__type_traits/conditional.h> #include <__type_traits/decay.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_pointer.h> #include <__type_traits/is_reference.h> #include <__type_traits/make_unsigned.h> diff --git a/libcxx/include/__string/constexpr_c_functions.h b/libcxx/include/__string/constexpr_c_functions.h index 32fc06e121b36..9b8871e2e71a3 100644 --- a/libcxx/include/__string/constexpr_c_functions.h +++ b/libcxx/include/__string/constexpr_c_functions.h @@ -13,11 +13,13 @@ #include <__memory/addressof.h> #include <__memory/construct_at.h> #include <__type_traits/datasizeof.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_always_bitcastable.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_equality_comparable.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_same.h> #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/is_trivially_lexicographically_comparable.h> diff --git a/libcxx/include/__tuple/tuple_size.h b/libcxx/include/__tuple/tuple_size.h index 18a17fd4d5878..21c9811abeee7 100644 --- a/libcxx/include/__tuple/tuple_size.h +++ b/libcxx/include/__tuple/tuple_size.h @@ -12,6 +12,8 @@ #include <__config> #include <__fwd/tuple.h> #include <__tuple/tuple_types.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_const.h> #include 
<__type_traits/is_volatile.h> #include diff --git a/libcxx/include/__type_traits/aligned_storage.h b/libcxx/include/__type_traits/aligned_storage.h index 46aae12832f86..49b4e971bbb67 100644 --- a/libcxx/include/__type_traits/aligned_storage.h +++ b/libcxx/include/__type_traits/aligned_storage.h @@ -10,11 +10,11 @@ #define _LIBCPP___TYPE_TRAITS_ALIGNED_STORAGE_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/conditional.h> #include <__type_traits/integral_constant.h> #include <__type_traits/nat.h> #include <__type_traits/type_list.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/aligned_union.h b/libcxx/include/__type_traits/aligned_union.h index 005ed9a096ea8..de62a4b1c2a33 100644 --- a/libcxx/include/__type_traits/aligned_union.h +++ b/libcxx/include/__type_traits/aligned_union.h @@ -10,9 +10,9 @@ #define _LIBCPP___TYPE_TRAITS_ALIGNED_UNION_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/aligned_storage.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/alignment_of.h b/libcxx/include/__type_traits/alignment_of.h index f2d069bf2488f..8871c8ce110d6 100644 --- a/libcxx/include/__type_traits/alignment_of.h +++ b/libcxx/include/__type_traits/alignment_of.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_ALIGNMENT_OF_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/extent.h b/libcxx/include/__type_traits/extent.h index bab03fe997eb6..1c34a4db1c4b5 100644 --- a/libcxx/include/__type_traits/extent.h +++ b/libcxx/include/__type_traits/extent.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_EXTENT_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_allocator.h b/libcxx/include/__type_traits/is_allocator.h index 144ffac4d7ce5..191eeb9a1f522 100644 --- a/libcxx/include/__type_traits/is_allocator.h +++ b/libcxx/include/__type_traits/is_allocator.h @@ -10,10 +10,10 @@ #define _LIBCPP___TYPE_IS_ALLOCATOR_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> #include <__type_traits/void_t.h> #include <__utility/declval.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_array.h b/libcxx/include/__type_traits/is_array.h index dc23de28d2c63..f34204e19ed89 100644 --- a/libcxx/include/__type_traits/is_array.h +++ b/libcxx/include/__type_traits/is_array.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_IS_ARRAY_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_bounded_array.h b/libcxx/include/__type_traits/is_bounded_array.h index 211403d638d08..a78b52e7062b8 100644 --- a/libcxx/include/__type_traits/is_bounded_array.h +++ b/libcxx/include/__type_traits/is_bounded_array.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_IS_BOUNDED_ARRAY_H #include <__config> +#include <__cstddef/size_t.h> #include 
<__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_nothrow_destructible.h b/libcxx/include/__type_traits/is_nothrow_destructible.h index c2d5ca87232aa..41271a38f3711 100644 --- a/libcxx/include/__type_traits/is_nothrow_destructible.h +++ b/libcxx/include/__type_traits/is_nothrow_destructible.h @@ -10,10 +10,10 @@ #define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_DESTRUCTIBLE_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_destructible.h> #include <__utility/declval.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_null_pointer.h b/libcxx/include/__type_traits/is_null_pointer.h index 9f5697e232684..abc5d142562f1 100644 --- a/libcxx/include/__type_traits/is_null_pointer.h +++ b/libcxx/include/__type_traits/is_null_pointer.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_IS_NULL_POINTER_H #include <__config> +#include <__cstddef/nullptr_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_swappable.h b/libcxx/include/__type_traits/is_swappable.h index 0b817e6509933..f4d687094bbca 100644 --- a/libcxx/include/__type_traits/is_swappable.h +++ b/libcxx/include/__type_traits/is_swappable.h @@ -10,6 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_IS_SWAPPABLE_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/add_lvalue_reference.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> @@ -18,7 +19,6 @@ #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/void_t.h> #include <__utility/declval.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/rank.h b/libcxx/include/__type_traits/rank.h index 7f6fad1c54024..aeeedec40dee5 100644 --- a/libcxx/include/__type_traits/rank.h +++ b/libcxx/include/__type_traits/rank.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_RANK_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/remove_all_extents.h b/libcxx/include/__type_traits/remove_all_extents.h index d5373b51f5221..db7dab4a6c132 100644 --- a/libcxx/include/__type_traits/remove_all_extents.h +++ b/libcxx/include/__type_traits/remove_all_extents.h @@ -10,7 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_REMOVE_ALL_EXTENTS_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/remove_extent.h b/libcxx/include/__type_traits/remove_extent.h index fe37b5c7266c6..aceeb47069660 100644 --- a/libcxx/include/__type_traits/remove_extent.h +++ b/libcxx/include/__type_traits/remove_extent.h @@ -10,7 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_REMOVE_EXTENT_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/type_list.h b/libcxx/include/__type_traits/type_list.h index 02905707ee37a..0d9ca98958377 100644 --- a/libcxx/include/__type_traits/type_list.h +++ 
b/libcxx/include/__type_traits/type_list.h @@ -10,7 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_TYPE_LIST_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__utility/in_place.h b/libcxx/include/__utility/in_place.h index fa7a2f4bfd4a9..459b271675261 100644 --- a/libcxx/include/__utility/in_place.h +++ b/libcxx/include/__utility/in_place.h @@ -10,6 +10,7 @@ #define _LIBCPP___UTILITY_IN_PLACE_H #include <__config> +#include <__type_traits/integral_constant.h> #include <__type_traits/remove_cvref.h> #include diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h index 0afbebcdc9f2a..78534a3f399f2 100644 --- a/libcxx/include/__utility/pair.h +++ b/libcxx/include/__utility/pair.h @@ -25,6 +25,7 @@ #include <__type_traits/common_type.h> #include <__type_traits/conditional.h> #include <__type_traits/decay.h> +#include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_constructible.h> diff --git a/libcxx/include/__utility/swap.h b/libcxx/include/__utility/swap.h index ab88b8e0a0b53..ecfbdec75a2ae 100644 --- a/libcxx/include/__utility/swap.h +++ b/libcxx/include/__utility/swap.h @@ -10,6 +10,7 @@ #define _LIBCPP___UTILITY_SWAP_H #include <__config> +#include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_nothrow_assignable.h> diff --git a/libcxx/include/any b/libcxx/include/any index 5def14dc87e6b..7630e8a057d05 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -90,6 +90,7 @@ namespace std { #include <__type_traits/aligned_storage.h> #include <__type_traits/conditional.h> #include <__type_traits/decay.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_function.h> #include <__type_traits/is_nothrow_constructible.h> diff --git a/libcxx/include/cstddef b/libcxx/include/cstddef index 592f6261a6de3..2b138d9690393 100644 --- a/libcxx/include/cstddef +++ b/libcxx/include/cstddef @@ -34,9 +34,6 @@ Types: */ #include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_integral.h> #include #include @@ -53,80 +50,10 @@ Types: # pragma GCC system_header #endif -_LIBCPP_BEGIN_NAMESPACE_STD - -using ::nullptr_t; -using ::ptrdiff_t _LIBCPP_USING_IF_EXISTS; -using ::size_t _LIBCPP_USING_IF_EXISTS; - -#if !defined(_LIBCPP_CXX03_LANG) -using ::max_align_t _LIBCPP_USING_IF_EXISTS; -#endif - -_LIBCPP_END_NAMESPACE_STD - -#if _LIBCPP_STD_VER >= 17 -namespace std { // purposefully not versioned - -enum class byte : unsigned char {}; - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator|(byte __lhs, byte __rhs) noexcept { - return static_cast( - static_cast(static_cast(__lhs) | static_cast(__rhs))); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator|=(byte& __lhs, byte __rhs) noexcept { - return __lhs = __lhs | __rhs; -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator&(byte __lhs, byte __rhs) noexcept { - return static_cast( - static_cast(static_cast(__lhs) & static_cast(__rhs))); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator&=(byte& __lhs, byte __rhs) noexcept { - return __lhs = __lhs & __rhs; -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator^(byte __lhs, byte __rhs) noexcept { - return static_cast( - 
static_cast(static_cast(__lhs) ^ static_cast(__rhs))); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator^=(byte& __lhs, byte __rhs) noexcept { - return __lhs = __lhs ^ __rhs; -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator~(byte __b) noexcept { - return static_cast(static_cast(~static_cast(__b))); -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr byte& operator<<=(byte& __lhs, _Integer __shift) noexcept { - return __lhs = __lhs << __shift; -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr byte operator<<(byte __lhs, _Integer __shift) noexcept { - return static_cast(static_cast(static_cast(__lhs) << __shift)); -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr byte& operator>>=(byte& __lhs, _Integer __shift) noexcept { - return __lhs = __lhs >> __shift; -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr byte operator>>(byte __lhs, _Integer __shift) noexcept { - return static_cast(static_cast(static_cast(__lhs) >> __shift)); -} - -template ::value, int> = 0> -[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Integer to_integer(byte __b) noexcept { - return static_cast<_Integer>(__b); -} - -} // namespace std -#endif // _LIBCPP_STD_VER >= 17 +#include <__cstddef/byte.h> +#include <__cstddef/max_align_t.h> +#include <__cstddef/nullptr_t.h> +#include <__cstddef/ptrdiff_t.h> +#include <__cstddef/size_t.h> #endif // _LIBCPP_CSTDDEF diff --git a/libcxx/include/cstdio b/libcxx/include/cstdio index 7f94371081f8b..a461c24dcc019 100644 --- a/libcxx/include/cstdio +++ b/libcxx/include/cstdio @@ -96,6 +96,7 @@ void perror(const char* s); */ #include <__config> +#include <__cstddef/size_t.h> #include @@ -115,7 +116,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD using ::FILE _LIBCPP_USING_IF_EXISTS; using ::fpos_t _LIBCPP_USING_IF_EXISTS; -using ::size_t _LIBCPP_USING_IF_EXISTS; using ::fclose _LIBCPP_USING_IF_EXISTS; using ::fflush _LIBCPP_USING_IF_EXISTS; diff --git a/libcxx/include/cstdlib b/libcxx/include/cstdlib index c817fd8f4accd..1ecdd3a6d0328 100644 --- a/libcxx/include/cstdlib +++ b/libcxx/include/cstdlib @@ -82,6 +82,7 @@ void *aligned_alloc(size_t alignment, size_t size); // C11 */ #include <__config> +#include <__cstddef/size_t.h> #include @@ -99,7 +100,6 @@ void *aligned_alloc(size_t alignment, size_t size); // C11 _LIBCPP_BEGIN_NAMESPACE_STD -using ::size_t _LIBCPP_USING_IF_EXISTS; using ::div_t _LIBCPP_USING_IF_EXISTS; using ::ldiv_t _LIBCPP_USING_IF_EXISTS; using ::lldiv_t _LIBCPP_USING_IF_EXISTS; diff --git a/libcxx/include/cstring b/libcxx/include/cstring index c2c92b02e73cc..5bb6e3e10628f 100644 --- a/libcxx/include/cstring +++ b/libcxx/include/cstring @@ -57,6 +57,7 @@ size_t strlen(const char* s); */ #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/is_constant_evaluated.h> #include @@ -75,7 +76,6 @@ size_t strlen(const char* s); _LIBCPP_BEGIN_NAMESPACE_STD -using ::size_t _LIBCPP_USING_IF_EXISTS; using ::memcpy _LIBCPP_USING_IF_EXISTS; using ::memmove _LIBCPP_USING_IF_EXISTS; using ::strcpy _LIBCPP_USING_IF_EXISTS; diff --git a/libcxx/include/ctime b/libcxx/include/ctime index f47b49a43e23e..d2d2470f2fa60 100644 --- a/libcxx/include/ctime +++ b/libcxx/include/ctime @@ -46,6 +46,7 @@ int timespec_get( struct timespec *ts, int base); // C++17 */ #include <__config> +#include <__cstddef/size_t.h> // is not provided by libc++ #if __has_include() @@ -62,7 +63,6 @@ int timespec_get( struct timespec *ts, int base); // C++17 _LIBCPP_BEGIN_NAMESPACE_STD using ::clock_t _LIBCPP_USING_IF_EXISTS; 
-using ::size_t _LIBCPP_USING_IF_EXISTS; using ::time_t _LIBCPP_USING_IF_EXISTS; using ::tm _LIBCPP_USING_IF_EXISTS; #if _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/cuchar b/libcxx/include/cuchar index f0015be275367..bfc26f03aaf78 100644 --- a/libcxx/include/cuchar +++ b/libcxx/include/cuchar @@ -37,6 +37,7 @@ size_t c32rtomb(char* s, char32_t c32, mbstate_t* ps); */ #include <__config> +#include <__cstddef/size_t.h> #include @@ -57,7 +58,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if !defined(_LIBCPP_CXX03_LANG) using ::mbstate_t _LIBCPP_USING_IF_EXISTS; -using ::size_t _LIBCPP_USING_IF_EXISTS; # if !defined(_LIBCPP_HAS_NO_C8RTOMB_MBRTOC8) using ::mbrtoc8 _LIBCPP_USING_IF_EXISTS; diff --git a/libcxx/include/cwchar b/libcxx/include/cwchar index 08cfac58c846a..f5a26664c1c3e 100644 --- a/libcxx/include/cwchar +++ b/libcxx/include/cwchar @@ -103,6 +103,7 @@ size_t wcsrtombs(char* restrict dst, const wchar_t** restrict src, size_t len, */ #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/copy_cv.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_equality_comparable.h> @@ -127,7 +128,6 @@ size_t wcsrtombs(char* restrict dst, const wchar_t** restrict src, size_t len, _LIBCPP_BEGIN_NAMESPACE_STD using ::mbstate_t _LIBCPP_USING_IF_EXISTS; -using ::size_t _LIBCPP_USING_IF_EXISTS; using ::tm _LIBCPP_USING_IF_EXISTS; using ::wint_t _LIBCPP_USING_IF_EXISTS; using ::FILE _LIBCPP_USING_IF_EXISTS; diff --git a/libcxx/include/experimental/__simd/reference.h b/libcxx/include/experimental/__simd/reference.h index 39f60d64f7b41..c60c08b0ea459 100644 --- a/libcxx/include/experimental/__simd/reference.h +++ b/libcxx/include/experimental/__simd/reference.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H #define _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H +#include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_same.h> #include <__utility/declval.h> diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h index 1add4653209ac..a2aeeb5cd0f54 100644 --- a/libcxx/include/experimental/__simd/scalar.h +++ b/libcxx/include/experimental/__simd/scalar.h @@ -11,6 +11,7 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_SCALAR_H #include <__assert> +#include <__type_traits/integral_constant.h> #include #include #include diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h index 37e334aad6da0..db0f9b39d9600 100644 --- a/libcxx/include/experimental/__simd/simd.h +++ b/libcxx/include/experimental/__simd/simd.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H +#include <__type_traits/enable_if.h> #include <__type_traits/is_same.h> #include <__type_traits/remove_cvref.h> #include <__utility/forward.h> diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h index fd6dee2e28ee9..d54d4898b718a 100644 --- a/libcxx/include/experimental/__simd/simd_mask.h +++ b/libcxx/include/experimental/__simd/simd_mask.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H +#include <__type_traits/enable_if.h> #include <__type_traits/is_same.h> #include #include diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h index 316866b84873d..5787f237bb01e 100644 --- a/libcxx/include/experimental/__simd/vec_ext.h +++ 
b/libcxx/include/experimental/__simd/vec_ext.h @@ -12,6 +12,7 @@ #include <__assert> #include <__bit/bit_ceil.h> +#include <__type_traits/integral_constant.h> #include <__utility/forward.h> #include <__utility/integer_sequence.h> #include diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 7cde21417561e..3abc11723a5a9 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -440,6 +440,11 @@ module std_cstdbool [system] { } module std_cstddef [system] { header "cstddef" + module byte { header "__cstddef/byte.h" } + module max_align_t { header "__cstddef/max_align_t.h" } + module nullptr_t { header "__cstddef/nullptr_t.h" } + module ptrdiff_t { header "__cstddef/ptrdiff_t.h" } + module size_t { header "__cstddef/size_t.h" } export * } module std_cstdint [system] { @@ -1418,6 +1423,7 @@ module std_private_iterator_iterator [system] { header "__iterato module std_private_iterator_iterator_traits [system] { header "__iterator/iterator_traits.h" export std_private_type_traits_is_primary_template + export std_private_type_traits_integral_constant } module std_private_iterator_iterator_with_data [system] { header "__iterator/iterator_with_data.h" } module std_private_iterator_mergeable [system] { @@ -1814,7 +1820,10 @@ module std_private_tuple_tuple_like [system] { export * } module std_private_tuple_tuple_like_ext [system] { header "__tuple/tuple_like_ext.h" } -module std_private_tuple_tuple_size [system] { header "__tuple/tuple_size.h" } +module std_private_tuple_tuple_size [system] { + header "__tuple/tuple_size.h" + export std_private_type_traits_integral_constant +} module std_private_tuple_tuple_types [system] { header "__tuple/tuple_types.h" } module std_private_type_traits_add_const [system] { header "__type_traits/add_const.h" } @@ -1926,7 +1935,10 @@ module std_private_type_traits_is_implicitly_default_constructible [system header "__type_traits/is_implicitly_default_constructible.h" export std_private_type_traits_integral_constant } -module std_private_type_traits_is_integral [system] { header "__type_traits/is_integral.h" } +module std_private_type_traits_is_integral [system] { + header "__type_traits/is_integral.h" + export std_private_type_traits_integral_constant +} module std_private_type_traits_is_literal_type [system] { header "__type_traits/is_literal_type.h" } module std_private_type_traits_is_member_pointer [system] { header "__type_traits/is_member_pointer.h" } module std_private_type_traits_is_nothrow_assignable [system] { header "__type_traits/is_nothrow_assignable.h" } @@ -1954,7 +1966,10 @@ module std_private_type_traits_is_primary_template [system header "__type_traits/is_primary_template.h" export std_private_type_traits_enable_if } -module std_private_type_traits_is_reference [system] { header "__type_traits/is_reference.h" } +module std_private_type_traits_is_reference [system] { + header "__type_traits/is_reference.h" + export std_private_type_traits_integral_constant +} module std_private_type_traits_is_reference_wrapper [system] { header "__type_traits/is_reference_wrapper.h" } module std_private_type_traits_is_referenceable [system] { header "__type_traits/is_referenceable.h" } module std_private_type_traits_is_same [system] { @@ -1976,7 +1991,10 @@ module std_private_type_traits_is_swappable [system module std_private_type_traits_is_trivial [system] { header "__type_traits/is_trivial.h" } module std_private_type_traits_is_trivially_assignable [system] { header "__type_traits/is_trivially_assignable.h" 
} module std_private_type_traits_is_trivially_constructible [system] { header "__type_traits/is_trivially_constructible.h" } -module std_private_type_traits_is_trivially_copyable [system] { header "__type_traits/is_trivially_copyable.h" } +module std_private_type_traits_is_trivially_copyable [system] { + header "__type_traits/is_trivially_copyable.h" + export std_private_type_traits_integral_constant +} module std_private_type_traits_is_trivially_destructible [system] { header "__type_traits/is_trivially_destructible.h" } module std_private_type_traits_is_trivially_lexicographically_comparable [system] { header "__type_traits/is_trivially_lexicographically_comparable.h" } module std_private_type_traits_is_trivially_relocatable [system] { header "__type_traits/is_trivially_relocatable.h" } @@ -2044,7 +2062,10 @@ module std_private_utility_exception_guard [system] { header "__utility/e module std_private_utility_exchange [system] { header "__utility/exchange.h" } module std_private_utility_forward [system] { header "__utility/forward.h" } module std_private_utility_forward_like [system] { header "__utility/forward_like.h" } -module std_private_utility_in_place [system] { header "__utility/in_place.h" } +module std_private_utility_in_place [system] { + header "__utility/in_place.h" + export std_private_type_traits_integral_constant +} module std_private_utility_integer_sequence [system] { header "__utility/integer_sequence.h" } module std_private_utility_is_pointer_in_range [system] { header "__utility/is_pointer_in_range.h" } module std_private_utility_is_valid_range [system] { header "__utility/is_valid_range.h" } diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 2727cad02fa99..54e0b4cf5d634 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -58,6 +58,7 @@ public: #include <__config> #include <__exception/exception.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_constant_evaluated.h> #include <__verbose_abort> #include diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map index acf3485e644af..69314ba756319 100644 --- a/libcxx/include/unordered_map +++ b/libcxx/include/unordered_map @@ -600,7 +600,9 @@ template #include <__ranges/concepts.h> #include <__ranges/container_compatible_range.h> #include <__ranges/from_range.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_allocator.h> +#include <__type_traits/is_integral.h> #include <__type_traits/type_identity.h> #include <__utility/forward.h> #include diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set index d11ceacba8143..fb50f78a02941 100644 --- a/libcxx/include/unordered_set +++ b/libcxx/include/unordered_set @@ -548,7 +548,9 @@ template #include <__ranges/concepts.h> #include <__ranges/container_compatible_range.h> #include <__ranges/from_range.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_allocator.h> +#include <__type_traits/is_integral.h> #include <__utility/forward.h> #include diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index 3bf39ea17c912..51f38ea086ac0 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -231,7 +231,6 @@ expected cstddef expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iosfwd experimental/iterator iterator experimental/iterator type_traits @@ -918,7 +917,6 @@ tuple type_traits 
tuple typeinfo tuple utility tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index 49125486cfcf6..f7b0179f6f60e 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -232,7 +232,6 @@ expected cstddef expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iosfwd experimental/iterator iterator experimental/iterator type_traits @@ -925,7 +924,6 @@ tuple type_traits tuple typeinfo tuple utility tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index 28dfb320fe06c..11afb76583a8c 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -233,7 +233,6 @@ expected cstddef expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iosfwd experimental/iterator iterator experimental/iterator type_traits @@ -928,7 +927,6 @@ tuple type_traits tuple typeinfo tuple utility tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index 5b7b6cecf73f8..42ba4ef3f6f52 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -233,7 +233,6 @@ expected cstddef expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iosfwd experimental/iterator iterator experimental/iterator type_traits @@ -929,7 +928,6 @@ tuple type_traits tuple typeinfo tuple utility tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index 84ea6433fb12d..cd48b37520f15 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -241,7 +241,6 @@ expected cstddef expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iosfwd experimental/iterator iterator experimental/iterator type_traits @@ -936,7 +935,6 @@ tuple type_traits tuple typeinfo tuple utility tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 946ba486294d3..db09568fc76ff 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -155,7 +155,6 @@ expected cstdint expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iterator experimental/memory cstddef experimental/memory cstdint @@ -647,7 +646,6 @@ tuple compare tuple cstddef tuple cstdint tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 946ba486294d3..db09568fc76ff 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ 
b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -155,7 +155,6 @@ expected cstdint expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iterator experimental/memory cstddef experimental/memory cstdint @@ -647,7 +646,6 @@ tuple compare tuple cstddef tuple cstdint tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp index 5e37db95ab090..1ab1aa60ab826 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp index 38a46fcc12227..f243e9d1f63b5 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp index baf2032a4a1f0..e6a0cfb9d11e0 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp index 81dca301e21fb..a1088218a35f0 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp index 92268cf563b47..82931162b4f39 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp index 9d5d361662e8c..20e0eff91b253 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp index 63b60640ac048..1fedc4f8a5386 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp index 1df1d883a12e1..4221b86fe1cc6 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git 
a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp index 588c5e0cf7af2..a7c5c43a4e2c2 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp index 16eabbd2a5a4d..72e412772fb08 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp index 53405588266f7..fc0fff60394e3 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp index 9d5d37abde958..897da8b769e6a 100644 --- a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp +++ b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp @@ -122,7 +122,9 @@ void header_exportable_declarations::registerMatchers(clang::ast_matchers::Match [[fallthrough]]; case FileType::ModulePartition: case FileType::CompatModulePartition: - finder->addMatcher(namedDecl(isExpansionInFileMatching(filename_)).bind("header_exportable_declarations"), this); + finder->addMatcher(namedDecl(anyOf(isExpansionInFileMatching(filename_), isExpansionInFileMatching(extra_header_))) + .bind("header_exportable_declarations"), + this); break; case FileType::Module: case FileType::CompatModule: diff --git a/libcxx/utils/libcxx/test/modules.py b/libcxx/utils/libcxx/test/modules.py index b7758dc9a41ee..91933d4f425bd 100644 --- a/libcxx/utils/libcxx/test/modules.py +++ b/libcxx/utils/libcxx/test/modules.py @@ -99,6 +99,10 @@ # same definition. ExtraHeader["functional"] = "v1/__compare/compare_three_way.h$" +# Some C compatibility headers define std::size_t, which is in <__cstddef/size_t.h> +for header in ("cstdio", "cstdlib", "cstring", "ctime", "cuchar", "cwchar"): + ExtraHeader[header] = "v1/__cstddef/size_t.h$" + # newline needs to be escaped for the module partition output. 
nl = "\\\\n" From 485d191f0ca5e31a60fe2489ac99270ed5c7a594 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 5 Sep 2024 12:29:55 +0000 Subject: [PATCH 216/425] [gn build] Port d6832a611a7c --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 39ee220ee3a72..371ca7fc7a37e 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -396,6 +396,11 @@ if (current_toolchain == default_toolchain) { "__coroutine/coroutine_traits.h", "__coroutine/noop_coroutine_handle.h", "__coroutine/trivial_awaitables.h", + "__cstddef/byte.h", + "__cstddef/max_align_t.h", + "__cstddef/nullptr_t.h", + "__cstddef/ptrdiff_t.h", + "__cstddef/size_t.h", "__debug_utils/randomize_range.h", "__debug_utils/sanitizers.h", "__debug_utils/strict_weak_ordering_check.h", From 2a07509c8d3c8b5b2c88e4f73dde0071bf506870 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 5 Sep 2024 13:42:59 +0100 Subject: [PATCH 217/425] [Clang] Add __builtin_is_within_lifetime to implement P2641R4's std::is_within_lifetime (#91895) [P2641R4](https://wg21.link/P2641R4) This new builtin function is declared `consteval`. Support for `-fexperimental-new-constant-interpreter` will be added in a later patch. --------- Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/Basic/Builtins.td | 6 + .../include/clang/Basic/DiagnosticASTKinds.td | 12 +- .../clang/Basic/DiagnosticSemaKinds.td | 4 + clang/lib/AST/ByteCode/State.h | 1 + clang/lib/AST/ExprConstant.cpp | 109 ++++- clang/lib/CodeGen/CGBuiltin.cpp | 3 + clang/lib/Sema/SemaChecking.cpp | 40 ++ clang/lib/Sema/SemaExpr.cpp | 3 +- .../SemaCXX/builtin-is-within-lifetime.cpp | 431 ++++++++++++++++++ clang/test/SemaCXX/consteval-builtin.cpp | 93 ++++ 11 files changed, 698 insertions(+), 7 deletions(-) create mode 100644 clang/test/SemaCXX/builtin-is-within-lifetime.cpp create mode 100644 clang/test/SemaCXX/consteval-builtin.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index dc103aceebc36..ab3c3e6049f60 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -122,6 +122,9 @@ C++2c Feature Support - Implemented `P2747R2 constexpr placement new `_. +- Added the ``__builtin_is_within_lifetime`` builtin, which supports + `P2641R4 Checking if a union alternative is active `_ + C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ - Removed the restriction to literal types in constexpr functions in C++23 mode. 
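For readers skimming the patch, a minimal illustration of the behavior the release note describes may help; it closely mirrors the `x1{ .c = 2 }` case in the new SemaCXX test further down, and the names `U` and `u` are illustrative rather than taken from the diff:

  // Sketch only: checking which union member is active during constant
  // evaluation with the new consteval builtin (requires this patch).
  union U {
    int i;
    char c;
  };
  constexpr U u{.c = 2};                               // 'c' is the active member

  static_assert(__builtin_is_within_lifetime(&u.c));   // true: c is within its lifetime
  static_assert(!__builtin_is_within_lifetime(&u.i));  // false: i is not the active member

Because the builtin is declared consteval, it can only be invoked during constant evaluation (for example from a static_assert or another consteval function); the std::is_within_lifetime wrapper from P2641R4 is expected to forward to it, as the test file below demonstrates.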
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 9e2a590f265ac..92118418d9d45 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -934,6 +934,12 @@ def IsConstantEvaluated : LangBuiltin<"CXX_LANG"> { let Prototype = "bool()"; } +def IsWithinLifetime : LangBuiltin<"CXX_LANG"> { + let Spellings = ["__builtin_is_within_lifetime"]; + let Attributes = [NoThrow, CustomTypeChecking, Consteval]; + let Prototype = "bool(void*)"; +} + // GCC exception builtins def EHReturn : Builtin { let Spellings = ["__builtin_eh_return"]; diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index 91135b18c7571..21a307d1e8987 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -167,14 +167,14 @@ def note_constexpr_this : Note< def access_kind : TextSubstitution< "%select{read of|read of|assignment to|increment of|decrement of|" "member call on|dynamic_cast of|typeid applied to|construction of|" - "destruction of}0">; + "destruction of|read of}0">; def access_kind_subobject : TextSubstitution< "%select{read of|read of|assignment to|increment of|decrement of|" "member call on|dynamic_cast of|typeid applied to|" - "construction of subobject of|destruction of}0">; + "construction of subobject of|destruction of|read of}0">; def access_kind_volatile : TextSubstitution< "%select{read of|read of|assignment to|increment of|decrement of|" - "||||}0">; + "|||||}0">; def note_constexpr_lifetime_ended : Note< "%sub{access_kind}0 %select{temporary|variable}1 whose " "%plural{8:storage duration|:lifetime}0 has ended">; @@ -407,6 +407,12 @@ def warn_is_constant_evaluated_always_true_constexpr : Warning< "'%0' will always evaluate to 'true' in a manifestly constant-evaluated expression">, InGroup>; +def err_invalid_is_within_lifetime : Note< + "'%0' cannot be called with " + "%select{a null pointer|a one-past-the-end pointer|" + "a pointer to an object whose lifetime has not yet begun}1" +>; + // inline asm related. let CategoryName = "Inline Assembly Issue" in { def err_asm_invalid_escape : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dcb49d8a67604..72ea5338ce615 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12186,6 +12186,10 @@ def err_builtin_launder_invalid_arg : Error< "%select{non-pointer|function pointer|void pointer}0 argument to " "'__builtin_launder' is not allowed">; +def err_builtin_is_within_lifetime_invalid_arg : Error< + "%select{non-|function }0pointer argument to '__builtin_is_within_lifetime' " + "is not allowed">; + def err_builtin_invalid_arg_type: Error < "%ordinal0 argument must be " "%select{a vector, integer or floating point type|a matrix|" diff --git a/clang/lib/AST/ByteCode/State.h b/clang/lib/AST/ByteCode/State.h index 2cffce4bc2ae4..3248e2d8be697 100644 --- a/clang/lib/AST/ByteCode/State.h +++ b/clang/lib/AST/ByteCode/State.h @@ -34,6 +34,7 @@ enum AccessKinds { AK_TypeId, AK_Construct, AK_Destroy, + AK_IsWithinLifetime, }; /// The order of this enum is important for diagnostics. 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 205cbdf52a6f7..0ad3577d4e102 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -1522,7 +1522,8 @@ CallStackFrame::~CallStackFrame() { } static bool isRead(AccessKinds AK) { - return AK == AK_Read || AK == AK_ReadObjectRepresentation; + return AK == AK_Read || AK == AK_ReadObjectRepresentation || + AK == AK_IsWithinLifetime; } static bool isModification(AccessKinds AK) { @@ -1532,6 +1533,7 @@ static bool isModification(AccessKinds AK) { case AK_MemberCall: case AK_DynamicCast: case AK_TypeId: + case AK_IsWithinLifetime: return false; case AK_Assign: case AK_Increment: @@ -1549,7 +1551,8 @@ static bool isAnyAccess(AccessKinds AK) { /// Is this an access per the C++ definition? static bool isFormalAccess(AccessKinds AK) { - return isAnyAccess(AK) && AK != AK_Construct && AK != AK_Destroy; + return isAnyAccess(AK) && AK != AK_Construct && AK != AK_Destroy && + AK != AK_IsWithinLifetime; } /// Is this kind of axcess valid on an indeterminate object value? @@ -1561,6 +1564,7 @@ static bool isValidIndeterminateAccess(AccessKinds AK) { // These need the object's value. return false; + case AK_IsWithinLifetime: case AK_ReadObjectRepresentation: case AK_Assign: case AK_Construct: @@ -3707,7 +3711,8 @@ struct CompleteObject { // In C++14 onwards, it is permitted to read a mutable member whose // lifetime began within the evaluation. // FIXME: Should we also allow this in C++11? - if (!Info.getLangOpts().CPlusPlus14) + if (!Info.getLangOpts().CPlusPlus14 && + AK != AccessKinds::AK_IsWithinLifetime) return false; return lifetimeStartedInEvaluation(Info, Base, /*MutableSubobject*/true); } @@ -3760,6 +3765,12 @@ findSubobject(EvalInfo &Info, const Expr *E, const CompleteObject &Obj, if ((O->isAbsent() && !(handler.AccessKind == AK_Construct && I == N)) || (O->isIndeterminate() && !isValidIndeterminateAccess(handler.AccessKind))) { + // Object has ended lifetime. + // If I is non-zero, some subobject (member or array element) of a + // complete object has ended its lifetime, so this is valid for + // IsWithinLifetime, resulting in false. + if (I != 0 && handler.AccessKind == AK_IsWithinLifetime) + return false; if (!Info.checkingPotentialConstantExpression()) Info.FFDiag(E, diag::note_constexpr_access_uninit) << handler.AccessKind << O->isIndeterminate() @@ -3927,6 +3938,9 @@ findSubobject(EvalInfo &Info, const Expr *E, const CompleteObject &Obj, // Placement new onto an inactive union member makes it active. O->setUnion(Field, APValue()); } else { + // Pointer to/into inactive union member: Not within lifetime + if (handler.AccessKind == AK_IsWithinLifetime) + return false; // FIXME: If O->getUnionValue() is absent, report that there's no // active union member rather than reporting the prior active union // member. 
We'll need to fix nullptr_t to not use APValue() as its @@ -11684,6 +11698,9 @@ class IntExprEvaluator bool ZeroInitialization(const Expr *E) { return Success(0, E); } + friend std::optional EvaluateBuiltinIsWithinLifetime(IntExprEvaluator &, + const CallExpr *); + //===--------------------------------------------------------------------===// // Visitor Methods //===--------------------------------------------------------------------===// @@ -12743,6 +12760,11 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, return Success(Info.InConstantContext, E); } + case Builtin::BI__builtin_is_within_lifetime: + if (auto result = EvaluateBuiltinIsWithinLifetime(*this, E)) + return Success(*result, E); + return false; + case Builtin::BI__builtin_ctz: case Builtin::BI__builtin_ctzl: case Builtin::BI__builtin_ctzll: @@ -17322,3 +17344,84 @@ bool Expr::tryEvaluateStrLen(uint64_t &Result, ASTContext &Ctx) const { EvalInfo Info(Ctx, Status, EvalInfo::EM_ConstantFold); return EvaluateBuiltinStrLen(this, Result, Info); } + +namespace { +struct IsWithinLifetimeHandler { + EvalInfo &Info; + static constexpr AccessKinds AccessKind = AccessKinds::AK_IsWithinLifetime; + using result_type = std::optional; + std::optional failed() { return std::nullopt; } + template + std::optional found(T &Subobj, QualType SubobjType) { + return true; + } +}; + +std::optional EvaluateBuiltinIsWithinLifetime(IntExprEvaluator &IEE, + const CallExpr *E) { + EvalInfo &Info = IEE.Info; + // Sometimes this is called during some sorts of constant folding / early + // evaluation. These are meant for non-constant expressions and are not + // necessary since this consteval builtin will never be evaluated at runtime. + // Just fail to evaluate when not in a constant context. + if (!Info.InConstantContext) + return std::nullopt; + assert(E->getBuiltinCallee() == Builtin::BI__builtin_is_within_lifetime); + const Expr *Arg = E->getArg(0); + if (Arg->isValueDependent()) + return std::nullopt; + LValue Val; + if (!EvaluatePointer(Arg, Val, Info)) + return std::nullopt; + + auto Error = [&](int Diag) { + bool CalledFromStd = false; + const auto *Callee = Info.CurrentCall->getCallee(); + if (Callee && Callee->isInStdNamespace()) { + const IdentifierInfo *Identifier = Callee->getIdentifier(); + CalledFromStd = Identifier && Identifier->isStr("is_within_lifetime"); + } + Info.CCEDiag(CalledFromStd ? Info.CurrentCall->getCallRange().getBegin() + : E->getExprLoc(), + diag::err_invalid_is_within_lifetime) + << (CalledFromStd ? "std::is_within_lifetime" + : "__builtin_is_within_lifetime") + << Diag; + return std::nullopt; + }; + // C++2c [meta.const.eval]p4: + // During the evaluation of an expression E as a core constant expression, a + // call to this function is ill-formed unless p points to an object that is + // usable in constant expressions or whose complete object's lifetime began + // within E. 
+ + // Make sure it points to an object + // nullptr does not point to an object + if (Val.isNullPointer() || Val.getLValueBase().isNull()) + return Error(0); + QualType T = Val.getLValueBase().getType(); + assert(!T->isFunctionType() && + "Pointers to functions should have been typed as function pointers " + "which would have been rejected earlier"); + assert(T->isObjectType()); + // Hypothetical array element is not an object + if (Val.getLValueDesignator().isOnePastTheEnd()) + return Error(1); + assert(Val.getLValueDesignator().isValidSubobject() && + "Unchecked case for valid subobject"); + // All other ill-formed values should have failed EvaluatePointer, so the + // object should be a pointer to an object that is usable in a constant + // expression or whose complete lifetime began within the expression + CompleteObject CO = + findCompleteObject(Info, E, AccessKinds::AK_IsWithinLifetime, Val, T); + // The lifetime hasn't begun yet if we are still evaluating the + // initializer ([basic.life]p(1.2)) + if (Info.EvaluatingDeclValue && CO.Value == Info.EvaluatingDeclValue) + return Error(2); + + if (!CO) + return false; + IsWithinLifetimeHandler handler{Info}; + return findSubobject(Info, E, CO, Val.getLValueDesignator(), handler); +} +} // namespace diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e826c1c6fbbd2..02d8726baa421 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2538,6 +2538,9 @@ static RValue EmitHipStdParUnsupportedBuiltin(CodeGenFunction *CGF, RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue) { + assert(!getContext().BuiltinInfo.isImmediate(BuiltinID) && + "Should not codegen for consteval builtins"); + const FunctionDecl *FD = GD.getDecl()->getAsFunction(); // See if we can constant fold this builtin. If so, don't emit it at all. // TODO: Extend this handling to all builtin calls that we can constant-fold. diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index b01765b6833a1..2aab52160afa7 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -1844,6 +1844,44 @@ static ExprResult BuiltinLaunder(Sema &S, CallExpr *TheCall) { return TheCall; } +static ExprResult BuiltinIsWithinLifetime(Sema &S, CallExpr *TheCall) { + if (S.checkArgCount(TheCall, 1)) + return ExprError(); + + ExprResult Arg = S.DefaultFunctionArrayLvalueConversion(TheCall->getArg(0)); + if (Arg.isInvalid()) + return ExprError(); + QualType ParamTy = Arg.get()->getType(); + TheCall->setArg(0, Arg.get()); + TheCall->setType(S.Context.BoolTy); + + // Only accept pointers to objects as arguments, which should have object + // pointer or void pointer types. 
+ if (const auto *PT = ParamTy->getAs()) { + // LWG4138: Function pointer types not allowed + if (PT->getPointeeType()->isFunctionType()) { + S.Diag(TheCall->getArg(0)->getExprLoc(), + diag::err_builtin_is_within_lifetime_invalid_arg) + << 1; + return ExprError(); + } + // Disallow VLAs too since those shouldn't be able to + // be a template parameter for `std::is_within_lifetime` + if (PT->getPointeeType()->isVariableArrayType()) { + S.Diag(TheCall->getArg(0)->getExprLoc(), diag::err_vla_unsupported) + << 1 << "__builtin_is_within_lifetime"; + return ExprError(); + } + } else { + S.Diag(TheCall->getArg(0)->getExprLoc(), + diag::err_builtin_is_within_lifetime_invalid_arg) + << 0; + return ExprError(); + } + + return TheCall; +} + // Emit an error and return true if the current object format type is in the // list of unsupported types. static bool CheckBuiltinTargetNotInUnsupported( @@ -2276,6 +2314,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, } case Builtin::BI__builtin_launder: return BuiltinLaunder(*this, TheCall); + case Builtin::BI__builtin_is_within_lifetime: + return BuiltinIsWithinLifetime(*this, TheCall); case Builtin::BI__sync_fetch_and_add: case Builtin::BI__sync_fetch_and_add_1: case Builtin::BI__sync_fetch_and_add_2: diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index e291ef6c97eef..32dac4440fb82 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -17622,7 +17622,8 @@ HandleImmediateInvocations(Sema &SemaRef, (SemaRef.inTemplateInstantiation() && !ImmediateEscalating)) { SemaRef.Diag(DR->getBeginLoc(), diag::err_invalid_consteval_take_address) << ND << isa(ND) << FD->isConsteval(); - SemaRef.Diag(ND->getLocation(), diag::note_declared_at); + if (!FD->getBuiltinID()) + SemaRef.Diag(ND->getLocation(), diag::note_declared_at); if (auto Context = SemaRef.InnermostDeclarationWithDelayedImmediateInvocations()) { SemaRef.Diag(Context->Loc, diag::note_invalid_consteval_initializer) diff --git a/clang/test/SemaCXX/builtin-is-within-lifetime.cpp b/clang/test/SemaCXX/builtin-is-within-lifetime.cpp new file mode 100644 index 0000000000000..62ff2681952ce --- /dev/null +++ b/clang/test/SemaCXX/builtin-is-within-lifetime.cpp @@ -0,0 +1,431 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-unused %s -verify=expected,cxx20 -Wno-vla-cxx-extension +// RUN: %clang_cc1 -std=c++23 -Wno-unused %s -verify=expected,sincecxx23 -Wno-vla-cxx-extension +// RUN: %clang_cc1 -std=c++26 -Wno-unused %s -verify=expected,sincecxx23 -Wno-vla-cxx-extension +// RUN: %clang_cc1 -std=c++26 -DINLINE_NAMESPACE -Wno-unused %s -verify=expected,sincecxx23 -Wno-vla-cxx-extension + +inline constexpr void* operator new(__SIZE_TYPE__, void* p) noexcept { return p; } +namespace std { +template +constexpr T* construct_at(T* p, Args&&... 
args) { return ::new((void*)p) T(static_cast(args)...); } +template +constexpr void destroy_at(T* p) { p->~T(); } +template +struct allocator { + constexpr T* allocate(__SIZE_TYPE__ n) { return static_cast(::operator new(n * sizeof(T))); } + constexpr void deallocate(T* p, __SIZE_TYPE__) { ::operator delete(p); } +}; +using nullptr_t = decltype(nullptr); +template +struct integral_constant { static constexpr T value = v; }; +template +using bool_constant = integral_constant; +using true_type = bool_constant; +using false_type = bool_constant; +template +inline constexpr bool is_function_v = __is_function(T); +#ifdef INLINE_NAMESPACE +inline namespace __1 { +#endif +template requires (!is_function_v) // #std-constraint +consteval bool is_within_lifetime(const T* p) noexcept { // #std-definition + return __builtin_is_within_lifetime(p); +} +#ifdef INLINE_NAMESPACE +} +#endif +} + +consteval bool test_union(int& i, char& c) { + if (__builtin_is_within_lifetime(&i) || __builtin_is_within_lifetime(&c)) + return false; + std::construct_at(&c, 1); + if (__builtin_is_within_lifetime(&i) || !__builtin_is_within_lifetime(&c)) + return false; + std::construct_at(&i, 3); + if (!__builtin_is_within_lifetime(&i) || __builtin_is_within_lifetime(&c)) + return false; + return true; +} + +static_assert([]{ + union { int i; char c; } u; + return test_union(u.i, u.c); +}()); +static_assert([]{ + union { int i; char c; }; + return test_union(i, c); +}()); +static_assert([]{ + struct { union { int i; char c; }; } u; + return test_union(u.i, u.c); +}()); +static_assert([]{ + struct { union { int i; char c; } u; } r; + return test_union(r.u.i, r.u.c); +}()); + +consteval bool test_nested() { + union { + union { int i; char c; } u; + long l; + }; + if (__builtin_is_within_lifetime(&l) || __builtin_is_within_lifetime(&u) || __builtin_is_within_lifetime(&u.i) || __builtin_is_within_lifetime(&u.c)) + return false; + std::construct_at(&l); + if (!__builtin_is_within_lifetime(&l) || __builtin_is_within_lifetime(&u) || __builtin_is_within_lifetime(&u.i) || __builtin_is_within_lifetime(&u.c)) + return false; + std::construct_at(&u); + std::construct_at(&u.i); + if (__builtin_is_within_lifetime(&l) || !__builtin_is_within_lifetime(&u) || !__builtin_is_within_lifetime(&u.i) || __builtin_is_within_lifetime(&u.c)) + return false; + std::construct_at(&u.c); + if (__builtin_is_within_lifetime(&l) || !__builtin_is_within_lifetime(&u) || __builtin_is_within_lifetime(&u.i) || !__builtin_is_within_lifetime(&u.c)) + return false; + return true; +} +static_assert(test_nested()); + +consteval bool test_dynamic(bool read_after_deallocate) { + std::allocator a; + int* p = a.allocate(1); + // a.allocate starts the lifetime of an array, + // the complete object of *p has started its lifetime + if (__builtin_is_within_lifetime(p)) + return false; + std::construct_at(p); + if (!__builtin_is_within_lifetime(p)) + return false; + std::destroy_at(p); + if (__builtin_is_within_lifetime(p)) + return false; + a.deallocate(p, 1); + if (read_after_deallocate) + __builtin_is_within_lifetime(p); // expected-note {{read of heap allocated object that has been deleted}} + return true; +} +static_assert(test_dynamic(false)); +static_assert(test_dynamic(true)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{in call to 'test_dynamic(true)'}} + +consteval bool test_automatic(int read_dangling) { + int* p; + { + int x = 0; + p = &x; + if (!__builtin_is_within_lifetime(p)) + return false; + 
} + { + int x = 0; + if (read_dangling == 1) + __builtin_is_within_lifetime(p); // expected-note {{read of object outside its lifetime is not allowed in a constant expression}} + } + if (read_dangling == 2) + __builtin_is_within_lifetime(p); // expected-note {{read of object outside its lifetime is not allowed in a constant expression}} + { + int x[4]; + p = &x[2]; + if (!__builtin_is_within_lifetime(p)) + return false; + } + if (read_dangling == 3) + __builtin_is_within_lifetime(p); // expected-note {{read of object outside its lifetime is not allowed in a constant expression}} + std::nullptr_t* q; + { + std::nullptr_t np = nullptr; + q = &np; + if (!__builtin_is_within_lifetime(q)) + return false; + } + if (read_dangling == 4) + __builtin_is_within_lifetime(q); // expected-note {{read of object outside its lifetime is not allowed in a constant expression}} + return true; +} +static_assert(test_automatic(0)); +static_assert(test_automatic(1)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{in call to 'test_automatic(1)'}} +static_assert(test_automatic(2)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{in call to 'test_automatic(2)'}} +static_assert(test_automatic(3)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{in call to 'test_automatic(3)'}} +static_assert(test_automatic(4)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{in call to 'test_automatic(4)'}} + + +consteval bool test_indeterminate() { + int x; + if (!__builtin_is_within_lifetime(&x)) + return false; + bool b = true; + unsigned char c = __builtin_bit_cast(unsigned char, b); + if (!__builtin_is_within_lifetime(&c)) + return false; + struct {} padding; + unsigned char y = __builtin_bit_cast(unsigned char, padding); + if (!__builtin_is_within_lifetime(&y)) + return false; + return true; +} +static_assert(test_indeterminate()); + +consteval bool test_volatile() { + int x; + if (!__builtin_is_within_lifetime(static_cast(&x)) || !__builtin_is_within_lifetime(static_cast(&x))) + return false; + volatile int y; + if (!__builtin_is_within_lifetime(const_cast(&y)) || !__builtin_is_within_lifetime(const_cast(static_cast(&y)))) + return false; + return true; +} +static_assert(test_volatile()); + +constexpr bool self = __builtin_is_within_lifetime(&self); +// expected-error@-1 {{constexpr variable 'self' must be initialized by a constant expression}} +// expected-note@-2 {{'__builtin_is_within_lifetime' cannot be called with a pointer to an object whose lifetime has not yet begun}} +// expected-error@-3 {{call to consteval function '__builtin_is_within_lifetime' is not a constant expression}} +// expected-note@-4 {{initializer of 'self' is not a constant expression}} +// expected-note@-5 {{declared here}} +constexpr int external{}; +static_assert(__builtin_is_within_lifetime(&external)); +void not_constexpr() { + __builtin_is_within_lifetime(&external); +} +void invalid_args() { + __builtin_is_within_lifetime(static_cast(nullptr)); + // expected-error@-1 {{call to consteval function '__builtin_is_within_lifetime' is not a constant expression}} + // expected-note@-2 {{'__builtin_is_within_lifetime' cannot be called with a null pointer}} + + // FIXME: avoid function to pointer conversion on all consteval builtins + __builtin_is_within_lifetime(0); + // 
expected-error@-1 {{non-pointer argument to '__builtin_is_within_lifetime' is not allowed}} + // expected-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} + __builtin_is_within_lifetime(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} + // expected-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} + __builtin_is_within_lifetime(1, 2); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + // expected-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} + __builtin_is_within_lifetime(&external, &external); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + // expected-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} +} + +constexpr struct { + union { + int i; + char c; + }; + mutable int mi; // #x-mi +} x1{ .c = 2 }; +static_assert(!__builtin_is_within_lifetime(&x1.i)); +static_assert(__builtin_is_within_lifetime(&x1.c)); +static_assert(__builtin_is_within_lifetime(&x1.mi)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{read of mutable member 'mi' is not allowed in a constant expression}} +// expected-note@#x-mi {{declared here}} + +constexpr struct NSDMI { // #NSDMI + bool a = true; + bool b = __builtin_is_within_lifetime(&a); // #NSDMI-read +} x2; +// expected-error@-1 {{constexpr variable 'x2' must be initialized by a constant expression}} +// expected-note@#NSDMI-read {{'__builtin_is_within_lifetime' cannot be called with a pointer to an object whose lifetime has not yet begun}} +// expected-note@-3 {{in call to 'NSDMI()'}} +// expected-error@-4 {{call to immediate function 'NSDMI::NSDMI' is not a constant expression}} +// expected-note@#NSDMI {{'NSDMI' is an immediate constructor because the default initializer of 'b' contains a call to a consteval function '__builtin_is_within_lifetime' and that call is not a constant expression}} +// expected-note@#NSDMI-read {{'__builtin_is_within_lifetime' cannot be called with a pointer to an object whose lifetime has not yet begun}} +// expected-note@-7 {{in call to 'NSDMI()'}} + +struct X3 { + consteval X3() { + __builtin_is_within_lifetime(this); // #X3-read + } +} x3; +// expected-error@-1 {{call to consteval function 'X3::X3' is not a constant expression}} +// expected-note@#X3-read {{'__builtin_is_within_lifetime' cannot be called with a pointer to an object whose lifetime has not yet begun}} +// expected-note@-3 {{in call to 'X3()'}} + +constexpr int i = 2; +static_assert(__builtin_is_within_lifetime(const_cast(&i))); +static_assert(__builtin_is_within_lifetime(const_cast(&i))); +static_assert(__builtin_is_within_lifetime(static_cast(&i))); + +constexpr int arr[2]{}; +static_assert(__builtin_is_within_lifetime(arr)); +static_assert(__builtin_is_within_lifetime(arr + 0)); +static_assert(__builtin_is_within_lifetime(arr + 1)); +void f() { + __builtin_is_within_lifetime(&i + 1); + // expected-error@-1 {{call to consteval function '__builtin_is_within_lifetime' is not a constant expression}} + // expected-note@-2 {{'__builtin_is_within_lifetime' cannot be called with a one-past-the-end pointer}} + __builtin_is_within_lifetime(arr + 2); + // expected-error@-1 {{call to consteval function 
'__builtin_is_within_lifetime' is not a constant expression}} + // expected-note@-2 {{'__builtin_is_within_lifetime' cannot be called with a one-past-the-end pointer}} +} + +template +consteval void disallow_function_types(bool b, const T* p) { + if (b) { + __builtin_is_within_lifetime(p); // expected-error {{function pointer argument to '__builtin_is_within_lifetime' is not allowed}} + } +} +void g() { + disallow_function_types(false, &f); + // expected-note@-1 {{in instantiation of function template specialization 'disallow_function_types' requested here}} +} + +struct OptBool { + union { bool b; char c; }; + + // note: this assumes common implementation properties for bool and char: + // * sizeof(bool) == sizeof(char), and + // * the value representations for true and false are distinct + // from the value representation for 2 + constexpr OptBool() : c(2) { } + constexpr OptBool(bool b) : b(b) { } + + constexpr auto has_value() const -> bool { + if consteval { // cxx20-warning {{consteval if}} + return __builtin_is_within_lifetime(&b); // during constant evaluation, cannot read from c + } else { + return c != 2; // during runtime, must read from c + } + } + + constexpr auto operator*() const -> const bool& { + return b; + } +}; + +constexpr OptBool disengaged; +constexpr OptBool engaged(true); +static_assert(!disengaged.has_value()); +static_assert(engaged.has_value()); +static_assert(*engaged); + +namespace vlas { + +consteval bool f(int n) { + int vla[n]; // cxx20-error {{variable of non-literal type}} + return __builtin_is_within_lifetime(static_cast(&vla)); +} +static_assert(f(1)); + +consteval bool fail(int n) { + int vla[n]; // cxx20-error {{variable of non-literal type}} + return __builtin_is_within_lifetime(&vla); // expected-error {{variable length arrays are not supported in '__builtin_is_within_lifetime'}} +} +static_assert(fail(1)); // sincecxx23-error {{static assertion expression is not an integral constant expression}} + +consteval bool variably_modified(int n) { + int(* p)[n]; + return __builtin_is_within_lifetime(&p); +} +static_assert(variably_modified(1)); + +} // namespace vlas + +consteval bool partial_arrays() { + int arr[2]; + if (!__builtin_is_within_lifetime(&arr) || !__builtin_is_within_lifetime(&arr[0]) || !__builtin_is_within_lifetime(&arr[1])) + return false; + std::destroy_at(&arr[0]); + if (!__builtin_is_within_lifetime(&arr) || __builtin_is_within_lifetime(&arr[0]) || !__builtin_is_within_lifetime(&arr[1])) + return false; + std::construct_at(&arr[0]); + if (!__builtin_is_within_lifetime(&arr) || !__builtin_is_within_lifetime(&arr[0]) || !__builtin_is_within_lifetime(&arr[1])) + return false; + return true; +} +static_assert(partial_arrays()); + +consteval bool partial_members() { + struct S { + int x; + int y; + } s; + if (!__builtin_is_within_lifetime(&s) || !__builtin_is_within_lifetime(&s.x) || !__builtin_is_within_lifetime(&s.y)) + return false; + std::destroy_at(&s.x); + if (!__builtin_is_within_lifetime(&s) || __builtin_is_within_lifetime(&s.x) || !__builtin_is_within_lifetime(&s.y)) + return false; + std::construct_at(&s.x); + if (!__builtin_is_within_lifetime(&s) || !__builtin_is_within_lifetime(&s.x) || !__builtin_is_within_lifetime(&s.y)) + return false; + return true; +} + +struct NonTrivial { + constexpr NonTrivial() {} + constexpr NonTrivial(const NonTrivial&) {} + constexpr ~NonTrivial() {} +}; + +template +constexpr T& unmove(T&& temp) { return static_cast(temp); } + +consteval bool test_temporaries() { + 
static_assert(__builtin_is_within_lifetime(&unmove(0))); + static_assert(__builtin_is_within_lifetime(&unmove(NonTrivial{}))); + if (!__builtin_is_within_lifetime(&unmove(0))) + return false; + if (!__builtin_is_within_lifetime(&unmove(NonTrivial{}))) + return false; + return true; +} +static_assert(test_temporaries()); + +constexpr const int& temp = 0; +static_assert(__builtin_is_within_lifetime(&temp)); + +template +constexpr T* test_dangling() { + T i; // expected-note 2 {{declared here}} + return &i; // expected-warning 2 {{address of stack memory associated with local variable 'i' returned}} +} +static_assert(__builtin_is_within_lifetime(test_dangling())); // expected-note {{in instantiation of function template specialization}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{read of variable whose lifetime has ended}} +static_assert(__builtin_is_within_lifetime(test_dangling())); // expected-note {{in instantiation of function template specialization}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{read of variable whose lifetime has ended}} + +template +concept CanCallAndPassToIsWithinLifetime = std::bool_constant<__builtin_is_within_lifetime(F())>::value; +static_assert(CanCallAndPassToIsWithinLifetime<[]{ return &i; }>); +static_assert(!CanCallAndPassToIsWithinLifetime<[]{ return static_cast(nullptr); }>); +static_assert(!CanCallAndPassToIsWithinLifetime<[]{ return static_cast(&f); }>); +template constexpr std::true_type sfinae() requires CanCallAndPassToIsWithinLifetime { return {}; } +template std::false_type sfinae() { return {}; } +static_assert(decltype(sfinae<[]{ return &i; }>())::value); +static_assert(!decltype(sfinae<[]{ return static_cast(nullptr); }>())::value); +std::true_type(* not_immediate)() = &sfinae<[]{ return &i; }>; + +void test_std_error_message() { + std::is_within_lifetime(static_cast(nullptr)); + // expected-error@-1 {{call to consteval function 'std::is_within_lifetime' is not a constant expression}} + // expected-note@-2 {{'std::is_within_lifetime' cannot be called with a null pointer}} + // expected-note@-3 {{in call to 'is_within_lifetime(nullptr)'}} + std::is_within_lifetime(&test_std_error_message); + // expected-error@-1 {{no matching function for call to 'is_within_lifetime'}} + // expected-note@#std-definition {{candidate template ignored: constraints not satisfied [with T = void ()]}} + // expected-note@#std-constraint {{because '!is_function_v' evaluated to false}} + std::is_within_lifetime(arr + 2); + // expected-error@-1 {{call to consteval function 'std::is_within_lifetime' is not a constant expression}} + // expected-note@-2 {{'std::is_within_lifetime' cannot be called with a one-past-the-end pointer}} + // expected-note@-3 {{in call to 'is_within_lifetime(&arr[2])'}} +} +struct XStd { + consteval XStd() { + std::is_within_lifetime(this); // #XStd-read + } +} xstd; +// expected-error@-1 {{call to consteval function 'XStd::XStd' is not a constant expression}} +// expected-note@#XStd-read {{'std::is_within_lifetime' cannot be called with a pointer to an object whose lifetime has not yet begun}} +// expected-note@#XStd-read {{in call to 'is_within_lifetime(&)'}} +// expected-note@-4 {{in call to 'XStd()'}} diff --git a/clang/test/SemaCXX/consteval-builtin.cpp b/clang/test/SemaCXX/consteval-builtin.cpp new file mode 100644 index 0000000000000..3ba95b4dbd9b5 --- /dev/null +++ 
b/clang/test/SemaCXX/consteval-builtin.cpp @@ -0,0 +1,93 @@ +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -Wno-unused %s -verify=cxx20-cxx26 +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -Wno-unused %s -verify=cxx20,cxx20-cxx26 +// RUN: %clang_cc1 -std=c++17 -fsyntax-only -Wno-unused %s -verify=precxx20,cxx11-cxx17 +// RUN: %clang_cc1 -std=c++14 -fsyntax-only -Wno-unused %s -verify=precxx20,cxx11-cxx17 +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -Wno-unused %s -verify=precxx20,cxx11-cxx17 +// RUN: %clang_cc1 -std=c++03 -fsyntax-only -Wno-unused %s -verify=precxx20 +// RUN: %clang_cc1 -std=c++98 -fsyntax-only -Wno-unused %s -verify=precxx20 +// RUN: %clang_cc1 -x c -std=c23 -fsyntax-only -Wno-unused %s -verify=c + +#if __has_builtin(__builtin_is_within_lifetime) +#error has the builtin +#else +#error does not have the builtin +#endif +// cxx20-cxx26-error@-4 {{has the builtin}} +// precxx20-error@-3 {{does not have the builtin}} +// c-error@-4 {{does not have the builtin}} + +#if __has_constexpr_builtin(__builtin_is_within_lifetime) +#error has the constexpr builtin +#else +#error does not have the constexpr builtin +#endif +// cxx20-cxx26-error@-4 {{has the constexpr builtin}} +// precxx20-error@-3 {{does not have the constexpr builtin}} +// c-error@-4 {{does not have the constexpr builtin}} + +#if __cplusplus < 201103L +#define static_assert __extension__ _Static_assert +#define CONSTEXPR11 +#else +#define CONSTEXPR11 constexpr +#endif + +static const int i1 = 0; +static_assert(__builtin_is_within_lifetime(&i1), ""); +// precxx20-error@-1 {{use of undeclared identifier '__builtin_is_within_lifetime'}} +// c-error@-2 {{use of undeclared identifier '__builtin_is_within_lifetime'}} + +#if !defined(__cplusplus) || __cplusplus >= 201102L +constexpr int i2 = 0; +static_assert(__builtin_is_within_lifetime(&i2), ""); +// cxx11-cxx17-error@-1 {{use of undeclared identifier '__builtin_is_within_lifetime'}} +// c-error@-2 {{use of undeclared identifier '__builtin_is_within_lifetime'}} +#endif + +#ifdef __cplusplus +template +CONSTEXPR11 bool f1(T i) { // #f1 + return __builtin_is_within_lifetime(&i); // #f1-consteval-call +} + +bool(&fp1)(int) = f1; +// cxx20-cxx26-error@-1 {{cannot take address of immediate function 'f1' outside of an immediate invocation}} +// cxx20-cxx26-note@#f1 {{declared here}} +// cxx20-cxx26-note@#f1-consteval-call {{'f1' is an immediate function because its body contains a call to a consteval function '__builtin_is_within_lifetime' and that call is not a constant expression}} +// precxx20-error@#f1-consteval-call {{use of undeclared identifier '__builtin_is_within_lifetime'}} +// precxx20-note@-5 {{in instantiation of function template specialization 'f1' requested here}} +#else +void f1(int i) { + __builtin_is_within_lifetime(&i); + // c-error@-1 {{use of undeclared identifier '__builtin_is_within_lifetime'}} +} +#endif + +#if __cplusplus >= 202002L +constexpr void f2() { + int i = 0; + if consteval { // cxx20-warning {{consteval if}} + __builtin_is_within_lifetime(&i); + } +} +void(&fp2)() = f2; + +constexpr void f3() { + __builtin_is_within_lifetime(&i1); +} +void(&fp3)() = f3; + +constexpr void f4() { + &__builtin_is_within_lifetime; + // cxx20-cxx26-error@-1 {{builtin functions must be directly called}} + // cxx20-cxx26-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} + __builtin_is_within_lifetime(); + // cxx20-cxx26-error@-1 {{too few arguments to function call, expected 1, have 0}} + // 
cxx20-cxx26-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} + int* not_constexpr; + __builtin_is_within_lifetime(not_constexpr); + // cxx20-cxx26-error@-1 {{call to consteval function '__builtin_is_within_lifetime' is not a constant expression}} + // cxx20-cxx26-note@-2 {{read of non-constexpr variable 'not_constexpr' is not allowed in a constant expression}} + // cxx20-cxx26-note@-4 {{declared here}} +} +#endif From e4fdbcc28f19b59fef065f2a6f939f91f286b9a8 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 17:27:46 -0400 Subject: [PATCH 218/425] [libc++] Add miscellaneous missing includes --- libcxx/include/__memory/uninitialized_algorithms.h | 1 + libcxx/include/forward_list | 1 + libcxx/include/list | 1 + libcxx/include/string | 1 + 4 files changed, 4 insertions(+) diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h index 72db3a266fdd4..8ff87e28b3bb5 100644 --- a/libcxx/include/__memory/uninitialized_algorithms.h +++ b/libcxx/include/__memory/uninitialized_algorithms.h @@ -22,6 +22,7 @@ #include <__memory/construct_at.h> #include <__memory/pointer_traits.h> #include <__memory/voidify.h> +#include <__type_traits/enable_if.h> #include <__type_traits/extent.h> #include <__type_traits/is_array.h> #include <__type_traits/is_constant_evaluated.h> diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 6c0dc5f96a5d5..3187b11e4dde7 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -218,6 +218,7 @@ template #include <__ranges/container_compatible_range.h> #include <__ranges/from_range.h> #include <__type_traits/conditional.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_allocator.h> #include <__type_traits/is_const.h> #include <__type_traits/is_nothrow_assignable.h> diff --git a/libcxx/include/list b/libcxx/include/list index 76b1d9241b41c..2aa774451ec2a 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -225,6 +225,7 @@ template #include <__ranges/container_compatible_range.h> #include <__ranges/from_range.h> #include <__type_traits/conditional.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_allocator.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> diff --git a/libcxx/include/string b/libcxx/include/string index 5cb0693ad10bc..3480b57375c11 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -621,6 +621,7 @@ basic_string operator""s( const char32_t *str, size_t len ); #include <__string/char_traits.h> #include <__string/extern_template_lists.h> #include <__type_traits/conditional.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_allocator.h> #include <__type_traits/is_array.h> #include <__type_traits/is_convertible.h> From 2c3da172d1869a2e261af38c45582027a9ff6af7 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 5 Sep 2024 14:01:12 +0100 Subject: [PATCH 219/425] LIR: strip unused LAA header dependency (NFC) (#107382) LoopIdiomRecognize does not use LoopAccessAnalysis. Make this clear. 
--- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 0ee1afa76a823..578d087e470e1 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -40,7 +40,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CmpInstAnalysis.h" -#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemoryLocation.h" From 3d01f0a33b9a14545217938fbd2475226ade2719 Mon Sep 17 00:00:00 2001 From: Andrea Faulds Date: Thu, 5 Sep 2024 15:03:22 +0200 Subject: [PATCH 220/425] [mlir][gpu] Add 'cluster_stride' attribute to gpu.subgroup_reduce (#107142) Follow-up to 7aa22f013e24d20291aad745368ff907baa9dfa4, adding an additional attribute needed in some applications. --- mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 32 +++++-- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 13 ++- .../GPU/Transforms/SubgroupReduceLowering.cpp | 90 +++++++++++-------- mlir/test/Dialect/GPU/canonicalize.mlir | 2 +- mlir/test/Dialect/GPU/invalid.mlir | 21 ++++- .../Dialect/GPU/subgroup-reduce-lowering.mlir | 47 +++++++--- 6 files changed, 144 insertions(+), 61 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index d2a5e5d77ad84..6098eb34d04d5 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1200,10 +1200,12 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType] The `subgroup_reduce` op reduces the values of lanes (work items) across a subgroup. - The subgroup is divided into clusters of `cluster_size` contiguous lanes - each, and a reduction is done for every lane of each cluster (in parallel). - The result is equal for all lanes in a cluster. When `cluster_size` is - omitted, there is a single cluster covering the entire subgroup. + The subgroup is divided into clusters starting at lane index 0. Within each + cluster, there are `size` lanes, and the lane index advances by `stride`. + A reduction is done for each cluster in parallel: every lane in the cluster + is reduced, and the result is equal for all lanes in the cluster. If `size` + is omitted, there is a single cluster covering the entire subgroup. If + `stride` is omitted, the stride is 1 (the cluster's lanes are contiguous). When the reduced value is of a vector type, each vector element is reduced independently. Only 1-d vector types are allowed. 
@@ -1213,7 +1215,8 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType] ```mlir %1 = gpu.subgroup_reduce add %a : (f32) -> f32 %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> vector<4xf16> - %3 = gpu.subgroup_reduce add %c cluster_size(4) : (f32) -> f32 + %3 = gpu.subgroup_reduce add %c cluster(size = 4) : (f32) -> f32 + %3 = gpu.subgroup_reduce add %c cluster(size = 4, stride = 2) : (f32) -> f32 ``` If `uniform` flag is set either none or all lanes of a subgroup need to execute @@ -1230,7 +1233,8 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType] AnyIntegerOrFloatOr1DVector:$value, GPU_AllReduceOperationAttr:$op, UnitAttr:$uniform, - OptionalAttr:$cluster_size + OptionalAttr:$cluster_size, + DefaultValuedAttr:$cluster_stride ); let results = (outs AnyIntegerOrFloatOr1DVector:$result); @@ -1238,19 +1242,29 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType] OpBuilder<(ins "Value":$value, "::mlir::gpu::AllReduceOperation":$op, "bool":$uniform), [{ - build($_builder, $_state, value, op, uniform, /*cluster_size=*/ nullptr); + build($_builder, $_state, value, op, uniform, std::nullopt); }]>, OpBuilder<(ins "Value":$value, "::mlir::gpu::AllReduceOperation":$op, "bool":$uniform, "std::optional":$cluster_size), [{ - build($_builder, $_state, value, op, uniform, cluster_size ? $_builder.getI32IntegerAttr(*cluster_size) : nullptr); + build($_builder, $_state, value, op, uniform, + cluster_size ? $_builder.getI32IntegerAttr(*cluster_size) : nullptr); + }]>, + OpBuilder<(ins "Value":$value, + "::mlir::gpu::AllReduceOperation":$op, + "bool":$uniform, + "std::optional":$cluster_size, + "uint32_t":$cluster_stride), [{ + build($_builder, $_state, value, op, uniform, + cluster_size ? $_builder.getI32IntegerAttr(*cluster_size) : nullptr, + cluster_stride); }]> ]; let assemblyFormat = [{ custom($op) $value (`uniform` $uniform^)? - (`cluster_size` `(` $cluster_size^ `)`)? + (`cluster` `(` `size` `=` $cluster_size^ (`,` `stride` `=` $cluster_stride^)? `)`)? 
attr-dict `:` functional-type(operands, results) }]; diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index e45ba7838b453..f822c11aeec00 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -621,7 +621,8 @@ LogicalResult gpu::SubgroupReduceOp::verify() { << getType(); } - if (auto clusterSize = getClusterSize()) { + auto clusterSize = getClusterSize(); + if (clusterSize) { uint32_t size = *clusterSize; if (!llvm::isPowerOf2_32(size)) { return emitOpError() << "cluster size " << size @@ -629,6 +630,16 @@ LogicalResult gpu::SubgroupReduceOp::verify() { } } + uint32_t stride = getClusterStride(); + if (stride != 1 && !clusterSize) { + return emitOpError() << "cluster stride can only be specified if cluster " + "size is specified"; + } + if (!llvm::isPowerOf2_32(stride)) { + return emitOpError() << "cluster stride " << stride + << " is not a power of two"; + } + return success(); } diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 288f7ab9f3022..2cf5d12588a73 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -50,8 +50,6 @@ struct BreakDownSubgroupReduce final : OpRewritePattern { LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, PatternRewriter &rewriter) const override { - std::optional clusterSize = op.getClusterSize(); - auto vecTy = dyn_cast(op.getType()); if (!vecTy || vecTy.getNumElements() < 2) return rewriter.notifyMatchFailure(op, "not a multi-element reduction"); @@ -97,7 +95,8 @@ struct BreakDownSubgroupReduce final : OpRewritePattern { } Value reduce = rewriter.create( - loc, extracted, op.getOp(), op.getUniform(), clusterSize); + loc, extracted, op.getOp(), op.getUniform(), op.getClusterSize(), + op.getClusterStride()); if (numElems == 1) { res = rewriter.create(loc, reduce, res, startIdx); continue; @@ -129,8 +128,6 @@ struct ScalarizeSingleElementReduce final LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, PatternRewriter &rewriter) const override { - std::optional clusterSize = op.getClusterSize(); - auto vecTy = dyn_cast(op.getType()); if (!vecTy || vecTy.getNumElements() != 1) return rewriter.notifyMatchFailure(op, "not a single-element reduction"); @@ -140,34 +137,64 @@ struct ScalarizeSingleElementReduce final Location loc = op.getLoc(); Value extracted = rewriter.create(loc, op.getValue(), 0); Value reduce = rewriter.create( - loc, extracted, op.getOp(), op.getUniform(), clusterSize); + loc, extracted, op.getOp(), op.getUniform(), op.getClusterSize(), + op.getClusterStride()); rewriter.replaceOpWithNewOp(op, vecTy, reduce); return success(); } }; +struct ClusterInfo { + unsigned clusterStride; + unsigned clusterSize; + unsigned subgroupSize; +}; + +static FailureOr +getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) { + assert(llvm::isPowerOf2_32(subgroupSize)); + + std::optional clusterSize = op.getClusterSize(); + assert(!clusterSize || + llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this. + if (clusterSize && *clusterSize > subgroupSize) + return op.emitOpError() + << "cluster size " << *clusterSize + << " is greater than subgroup size " << subgroupSize; + unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); + + auto clusterStride = op.getClusterStride(); + assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this. 
+ if (clusterStride >= subgroupSize) + return op.emitOpError() + << "cluster stride " << clusterStride + << " is not less than subgroup size " << subgroupSize; + + return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize}; +} + /// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn` /// and `unpackFn` to convert to the native shuffle type and to the reduction /// type, respectively. For example, with `input` of type `f16`, `packFn` could /// build ops to cast the value to `i32` to perform shuffles, while `unpackFn` /// would cast it back to `f16` to perform arithmetic reduction on. Assumes that /// the subgroup is `subgroupSize` lanes wide and divides it into clusters of -/// `clusterSize` lanes, reducing all lanes in each cluster in parallel. -static Value createSubgroupShuffleReduction( - OpBuilder &builder, Location loc, Value input, gpu::AllReduceOperation mode, - unsigned clusterSize, unsigned subgroupSize, - function_ref packFn, function_ref unpackFn) { - assert(llvm::isPowerOf2_32(clusterSize)); - assert(llvm::isPowerOf2_32(subgroupSize)); - assert(clusterSize <= subgroupSize); +/// `clusterSize` lanes starting at lane 0 with a stride of `clusterStride` for +/// lanes within a cluster, reducing all lanes in each cluster in parallel. +Value createSubgroupShuffleReduction(OpBuilder &builder, Location loc, + Value input, gpu::AllReduceOperation mode, + const ClusterInfo &ci, + function_ref packFn, + function_ref unpackFn) { // Lane value always stays in the original type. We use it to perform arith // reductions. Value laneVal = input; // Parallel reduction using butterfly shuffles. - for (unsigned i = 1; i < clusterSize; i <<= 1) { + for (unsigned i = ci.clusterStride; i < ci.clusterStride * ci.clusterSize; + i <<= 1) { Value shuffled = builder .create(loc, packFn(laneVal), i, - /*width=*/subgroupSize, + /*width=*/ci.subgroupSize, /*mode=*/gpu::ShuffleMode::XOR) .getShuffleResult(); laneVal = vector::makeArithReduction(builder, loc, @@ -190,12 +217,9 @@ struct ScalarSubgroupReduceToShuffles final LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, PatternRewriter &rewriter) const override { - std::optional clusterSize = op.getClusterSize(); - if (clusterSize && *clusterSize > subgroupSize) - return op.emitOpError() - << "cluster size " << *clusterSize - << " is greater than subgroup size " << subgroupSize; - unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); + auto ci = getAndValidateClusterInfo(op, subgroupSize); + if (failed(ci)) + return failure(); Type valueTy = op.getType(); unsigned elemBitwidth = @@ -209,9 +233,8 @@ struct ScalarSubgroupReduceToShuffles final if (elemBitwidth == shuffleBitwidth) { auto identityFn = [](Value v) { return v; }; rewriter.replaceOp(op, createSubgroupShuffleReduction( - rewriter, loc, op.getValue(), op.getOp(), - effectiveClusterSize, subgroupSize, identityFn, - identityFn)); + rewriter, loc, op.getValue(), op.getOp(), *ci, + identityFn, identityFn)); return success(); } @@ -232,8 +255,7 @@ struct ScalarSubgroupReduceToShuffles final rewriter.replaceOp( op, createSubgroupShuffleReduction(rewriter, loc, op.getValue(), - op.getOp(), effectiveClusterSize, - subgroupSize, packFn, unpackFn)); + op.getOp(), *ci, packFn, unpackFn)); return success(); } @@ -253,12 +275,9 @@ struct VectorSubgroupReduceToShuffles final LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, PatternRewriter &rewriter) const override { - std::optional clusterSize = op.getClusterSize(); - if (clusterSize && *clusterSize > 
subgroupSize) - return op.emitOpError() - << "cluster size " << *clusterSize - << " is greater than subgroup size " << subgroupSize; - unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); + auto ci = getAndValidateClusterInfo(op, subgroupSize); + if (failed(ci)) + return failure(); auto vecTy = dyn_cast(op.getType()); if (!vecTy) @@ -308,9 +327,8 @@ struct VectorSubgroupReduceToShuffles final return rewriter.create(loc, extendedVecTy, asIntVec); }; - Value res = createSubgroupShuffleReduction(rewriter, loc, extendedInput, - op.getOp(), effectiveClusterSize, - subgroupSize, packFn, unpackFn); + Value res = createSubgroupShuffleReduction( + rewriter, loc, extendedInput, op.getOp(), *ci, packFn, unpackFn); if (vecBitwidth < shuffleBitwidth) { res = rewriter.create( diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir index 469c03c9460df..d342ae9df10ee 100644 --- a/mlir/test/Dialect/GPU/canonicalize.mlir +++ b/mlir/test/Dialect/GPU/canonicalize.mlir @@ -255,7 +255,7 @@ func.func @subgroup_reduce_cluster_size_1() { gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2) threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) { %1 = "test.test2"() : () -> i32 - %2 = gpu.subgroup_reduce add %1 cluster_size(1) : (i32) -> (i32) + %2 = gpu.subgroup_reduce add %1 cluster(size=1) : (i32) -> (i32) "test.test3"(%2) : (i32) -> () gpu.terminator } diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir index fd7618020b5d8..2a0f7e8c6b10c 100644 --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -335,7 +335,7 @@ func.func @reduce_invalid_op_type_maximumf(%arg0 : i32) { func.func @subgroup_reduce_zero_cluster_size(%arg0 : vector<4xf32>) { // expected-error@+1 {{cluster size 0 is not a power of two}} - %res = gpu.subgroup_reduce add %arg0 cluster_size(0) : (vector<4xf32>) -> vector<4xf32> + %res = gpu.subgroup_reduce add %arg0 cluster(size = 0) : (vector<4xf32>) -> vector<4xf32> return } @@ -343,10 +343,27 @@ func.func @subgroup_reduce_zero_cluster_size(%arg0 : vector<4xf32>) { func.func @subgroup_reduce_npot_cluster_size(%arg0 : vector<4xf32>) { // expected-error@+1 {{cluster size 3 is not a power of two}} - %res = gpu.subgroup_reduce add %arg0 cluster_size(3) : (vector<4xf32>) -> vector<4xf32> + %res = gpu.subgroup_reduce add %arg0 cluster(size = 3) : (vector<4xf32>) -> vector<4xf32> return } +// ----- + +func.func @subgroup_reduce_zero_cluster_stride(%arg0 : vector<4xf32>) { + // expected-error@+1 {{cluster stride 0 is not a power of two}} + %res = gpu.subgroup_reduce add %arg0 cluster(size = 4, stride = 0) : (vector<4xf32>) -> vector<4xf32> + return +} + +// ----- + +func.func @subgroup_reduce_cluster_stride_without_size(%arg0 : vector<4xf32>) { + // expected-error@+1 {{cluster stride can only be specified if cluster size is specified}} + %res = gpu.subgroup_reduce add %arg0 { cluster_stride = 2 : i32 } : (vector<4xf32>) -> vector<4xf32> + return +} + + // ----- func.func @subgroup_reduce_bad_type(%arg0 : vector<2x2xf32>) { diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir index 37608ce4cfed7..9f2aa1be52fc3 100644 --- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir @@ -34,14 +34,14 @@ gpu.module @kernels { %sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>) 
"test.consume"(%sum1) : (vector<5xf16>) -> () - // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster_size(4) + // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4) // CHECK-SUB: "test.consume" - %sum2 = gpu.subgroup_reduce mul %arg0 cluster_size(4) : (vector<5xf16>) -> (vector<5xf16>) + %sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum2) : (vector<5xf16>) -> () - // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform cluster_size(4) + // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform cluster(size = 4, stride = 2) // CHECK-SUB: "test.consume" - %sum3 = gpu.subgroup_reduce mul %arg0 uniform cluster_size(4) : (vector<5xf16>) -> (vector<5xf16>) + %sum3 = gpu.subgroup_reduce mul %arg0 uniform cluster(size = 4, stride = 2) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum3) : (vector<5xf16>) -> () // CHECK-SUB: gpu.return @@ -65,14 +65,15 @@ gpu.module @kernels { %sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum1) : (vector<1xf32>) -> () - // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster_size(8) : (f32) -> f32 + // Note stride is dropped because it is == 1. + // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32 // CHECK-SUB: "test.consume" - %sum2 = gpu.subgroup_reduce add %arg0 cluster_size(8) : (vector<1xf32>) -> (vector<1xf32>) + %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum2) : (vector<1xf32>) -> () - // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster_size(8) : (f32) -> f32 + // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32 // CHECK-SUB: "test.consume" - %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster_size(8) : (vector<1xf32>) -> (vector<1xf32>) + %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum3) : (vector<1xf32>) -> () // CHECK-SUB: gpu.return @@ -143,7 +144,29 @@ gpu.module @kernels { // CHECK-SHFL: %[[S2:.+]], %{{.+}} = gpu.shuffle xor %[[A1]], %[[C4]], %[[C32]] : i32 // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> () - %sum0 = gpu.subgroup_reduce add %arg0 cluster_size(8) : (i32) -> i32 + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32 + "test.consume"(%sum0) : (i32) -> () + + // CHECK-SHFL: gpu.return + gpu.return + } + + // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided( + // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + gpu.func @kernel3_clustered_strided(%arg0: i32) kernel { + // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32 + // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32 + // CHECK-SHFL-DAG: %[[C4:.+]] = arith.constant 16 : i32 + // CHECK-SHFL-DAG: %[[C32:.+]] = arith.constant 32 : i32 + + // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[ARG0]], %[[C1]], %[[C32]] : i32 + // CHECK-SHFL: %[[A0:.+]] = arith.addi %[[ARG0]], %[[S0]] : i32 + // CHECK-SHFL: %[[S1:.+]], %{{.+}} = gpu.shuffle xor %[[A0]], %[[C2]], %[[C32]] : i32 + // CHECK-SHFL: %[[A1:.+]] = arith.addi %[[A0]], %[[S1]] : i32 + // CHECK-SHFL: %[[S2:.+]], %{{.+}} = gpu.shuffle xor %[[A1]], %[[C4]], %[[C32]] : i32 + // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 + // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> () + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 4) : 
(i32) -> i32 "test.consume"(%sum0) : (i32) -> () // CHECK-SHFL: gpu.return @@ -194,7 +217,7 @@ gpu.module @kernels { // CHECK-SHFL-DAG: %[[C32:.+]] = arith.constant 32 : i32 // CHECK-SHFL-COUNT-2: gpu.shuffle xor - %sum0 = gpu.subgroup_reduce add %arg0 cluster_size(4) : (vector<2xf16>) -> (vector<2xf16>) + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 4) : (vector<2xf16>) -> (vector<2xf16>) "test.consume"(%sum0) : (vector<2xf16>) -> () // CHECK-SHFL: gpu.return @@ -234,7 +257,7 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - %sum0 = gpu.subgroup_reduce add %arg0 cluster_size(16) : (i16) -> i16 + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () // CHECK-SHFL: gpu.return @@ -268,7 +291,7 @@ gpu.module @kernels { // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) gpu.func @kernel_cluster_size_is_subgroup_size(%arg0: vector<3xi8>) kernel { // CHECK-SHFL-COUNT-5: gpu.shuffle xor - %sum0 = gpu.subgroup_reduce add %arg0 cluster_size(32) : (vector<3xi8>) -> (vector<3xi8>) + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (vector<3xi8>) -> (vector<3xi8>) "test.consume"(%sum0) : (vector<3xi8>) -> () // CHECK-SHFL: gpu.return From 7d1a68178ef4332c9bf19a5c959a3ec4cef0285d Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 15:10:00 +0200 Subject: [PATCH 221/425] [SystemZ] Use APInt::getAllOnes() This was using -1 without setting the signed flag. Split off from https://github.com/llvm/llvm-project/pull/80309. --- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 6f84bd6c6e4ff..582a8c139b293 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -7981,7 +7981,7 @@ static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts, break; } case Intrinsic::s390_vperm: - SrcDemE = APInt(NumElts, -1); + SrcDemE = APInt::getAllOnes(NumElts); break; default: llvm_unreachable("Unhandled intrinsic."); From 67e19e5bb11d8ed2f1b5a0b8145331c8bf4522e9 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 15:17:25 +0200 Subject: [PATCH 222/425] [flang] Set isSigned=true for negative constant (NFC) We're providing this as a negative signed value, so set the flag. Currently doesn't make a difference, but will assert in the future. Split out of https://github.com/llvm/llvm-project/pull/80309. 
--- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index eb91969236ae0..ac521ae95df39 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -1262,7 +1262,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion { } else { auto maskAttr = mlir::IntegerAttr::get( rewriter.getIntegerType(8, /*isSigned=*/false), - llvm::APInt(8, (uint64_t)~_CFI_ADDENDUM_FLAG, /*isSigned=*/false)); + llvm::APInt(8, (uint64_t)~_CFI_ADDENDUM_FLAG, /*isSigned=*/true)); mlir::LLVM::ConstantOp mask = rewriter.create( loc, rewriter.getI8Type(), maskAttr); extraField = rewriter.create(loc, extraField, mask); From 9e9971b100e121b83f1de9e9206cddb52cda4815 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 15:19:16 +0200 Subject: [PATCH 223/425] [PatternMatchTest] Use APInt::getAllOnes() (NFC) Split out from https://github.com/llvm/llvm-project/pull/80309 to avoid assertion failures in the future. --- llvm/unittests/IR/PatternMatch.cpp | 202 ++++++++++++++--------------- 1 file changed, 101 insertions(+), 101 deletions(-) diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index 379f97fb63139..13f121a2b9c7d 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -71,7 +71,7 @@ TEST_F(PatternMatchTest, SpecificIntEQ) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, APInt(BitWidth, 0)) @@ -93,15 +93,15 @@ TEST_F(PatternMatchTest, SpecificIntEQ) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, APInt(BitWidth, -1)) - .match(One)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntNE) { @@ -110,7 +110,7 @@ TEST_F(PatternMatchTest, SpecificIntNE) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_FALSE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, APInt(BitWidth, 0)) @@ -132,15 +132,15 @@ TEST_F(PatternMatchTest, SpecificIntNE) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, APInt(BitWidth, -1)) - .match(One)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + 
EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntUGT) { @@ -149,7 +149,7 @@ TEST_F(PatternMatchTest, SpecificIntUGT) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_FALSE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, APInt(BitWidth, 0)) @@ -171,15 +171,15 @@ TEST_F(PatternMatchTest, SpecificIntUGT) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, APInt(BitWidth, -1)) - .match(One)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SignbitZeroChecks) { @@ -187,7 +187,7 @@ TEST_F(PatternMatchTest, SignbitZeroChecks) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE(m_Negative().match(NegOne)); EXPECT_FALSE(m_NonNegative().match(NegOne)); @@ -211,7 +211,7 @@ TEST_F(PatternMatchTest, SpecificIntUGE) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, APInt(BitWidth, 0)) @@ -233,15 +233,15 @@ TEST_F(PatternMatchTest, SpecificIntUGE) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, APInt(BitWidth, -1)) - .match(One)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntULT) { @@ -250,7 +250,7 @@ TEST_F(PatternMatchTest, SpecificIntULT) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_FALSE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, APInt(BitWidth, 0)) @@ -272,15 +272,15 @@ TEST_F(PatternMatchTest, SpecificIntULT) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, APInt(BitWidth, -1)) - 
.match(Zero)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, APInt(BitWidth, -1)) - .match(One)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntULE) { @@ -289,7 +289,7 @@ TEST_F(PatternMatchTest, SpecificIntULE) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, APInt(BitWidth, 0)) @@ -311,15 +311,15 @@ TEST_F(PatternMatchTest, SpecificIntULE) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, APInt(BitWidth, -1)) - .match(One)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntSGT) { @@ -328,7 +328,7 @@ TEST_F(PatternMatchTest, SpecificIntSGT) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_FALSE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, APInt(BitWidth, 0)) @@ -350,15 +350,15 @@ TEST_F(PatternMatchTest, SpecificIntSGT) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, APInt(BitWidth, -1)) - .match(One)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntSGE) { @@ -367,7 +367,7 @@ TEST_F(PatternMatchTest, SpecificIntSGE) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, APInt(BitWidth, 0)) @@ -389,15 +389,15 @@ TEST_F(PatternMatchTest, SpecificIntSGE) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, APInt(BitWidth, -1)) - .match(Zero)); 
- EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, APInt(BitWidth, -1)) - .match(One)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntSLT) { @@ -406,7 +406,7 @@ TEST_F(PatternMatchTest, SpecificIntSLT) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_FALSE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, APInt(BitWidth, 0)) @@ -428,15 +428,15 @@ TEST_F(PatternMatchTest, SpecificIntSLT) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, APInt(BitWidth, -1)) - .match(One)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntSLE) { @@ -445,7 +445,7 @@ TEST_F(PatternMatchTest, SpecificIntSLE) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, APInt(BitWidth, 0)) @@ -467,15 +467,15 @@ TEST_F(PatternMatchTest, SpecificIntSLE) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, APInt(BitWidth, -1)) - .match(One)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, Unless) { From eae1d6152fd77511f943fd7f300a971c53453e70 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 5 Sep 2024 14:28:37 +0100 Subject: [PATCH 224/425] [X86] Add test coverage for #107289 --- .../CodeGen/X86/vector-shuffle-combining.ll | 56 ++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index b5adfb3733357..923af983f1d47 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3533,6 +3533,58 @@ 
define <4 x i32> @PR63700(i128 %0) { ret <4 x i32> %shuffle.i11 } +define <16 x i8> @PR107289(<16 x i8> %0) { +; SSE2-LABEL: PR107289: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: shldq $8, %rax, %rcx +; SSE2-NEXT: shlq $8, %rax +; SSE2-NEXT: movq %rcx, %xmm1 +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR107289: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq %xmm0, %rax +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: shldq $8, %rax, %rcx +; SSSE3-NEXT: shlq $8, %rax +; SSSE3-NEXT: movq %rcx, %xmm1 +; SSSE3-NEXT: movq %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR107289: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: shldq $8, %rcx, %rax +; SSE41-NEXT: shlq $8, %rcx +; SSE41-NEXT: movq %rax, %xmm1 +; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: PR107289: +; AVX: # %bb.0: +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vmovq %xmm0, %rcx +; AVX-NEXT: shldq $8, %rcx, %rax +; AVX-NEXT: shlq $8, %rcx +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovq %rcx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq + %src = bitcast <16 x i8> %0 to i128 + %shl = shl i128 %src, 8 + %res = bitcast i128 %shl to <16 x i8> + ret <16 x i8> %res +} + ; Test case reported on D105827 define void @SpinningCube() { ; SSE2-LABEL: SpinningCube: @@ -3641,9 +3693,9 @@ define void @autogen_SD25931() { ; CHECK-LABEL: autogen_SD25931: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB141_1: # %CF242 +; CHECK-NEXT: .LBB142_1: # %CF242 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: jmp .LBB141_1 +; CHECK-NEXT: jmp .LBB142_1 BB: %Cmp16 = icmp uge <2 x i1> zeroinitializer, zeroinitializer %Shuff19 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Cmp16, <2 x i32> From 1a1264726db275d4b207c5bc640e2779dd484478 Mon Sep 17 00:00:00 2001 From: Robin Caloudis Date: Thu, 5 Sep 2024 15:44:16 +0200 Subject: [PATCH 225/425] [libc++][math] Add `constexpr` for `std::signbit()` (#105946) ## Why Since 18th of August, the floating point comparison builtin ``__builtin_signbit`` is available in Clang as constant expression (https://github.com/llvm/llvm-project/pull/94118). ## What * Implement `constexpr` for `std::signbit()` as defined by [P0533R9](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p0533r9.pdf) (new C++23 feature) * Restrict execution of tests to tip-of-trunk Clang as builtin is not yet available (note that builtin is available in GCC) --- libcxx/include/__math/traits.h | 13 +++- .../c.math/constexpr-cxx23-clang.pass.cpp | 7 ++ .../c.math/constexpr-cxx23-gcc.pass.cpp | 6 +- .../test/std/numerics/c.math/signbit.pass.cpp | 78 +++++++++++++++++++ 4 files changed, 98 insertions(+), 6 deletions(-) create mode 100644 libcxx/test/std/numerics/c.math/signbit.pass.cpp diff --git a/libcxx/include/__math/traits.h b/libcxx/include/__math/traits.h index 0638a6949580e..3d4f14fc9cd55 100644 --- a/libcxx/include/__math/traits.h +++ b/libcxx/include/__math/traits.h @@ -27,18 +27,25 @@ namespace __math { // signbit +// TODO(LLVM 22): Remove conditional once support for Clang 19 is dropped. 
+#if defined(_LIBCPP_COMPILER_GCC) || __has_constexpr_builtin(__builtin_signbit) +# define _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_CONSTEXPR_SINCE_CXX23 +#else +# define _LIBCPP_SIGNBIT_CONSTEXPR +#endif + template <class _A1, __enable_if_t<is_floating_point<_A1>::value, int> = 0> -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT { return __builtin_signbit(__x); } template <class _A1, __enable_if_t<is_integral<_A1>::value && is_signed<_A1>::value, int> = 0> -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT { return __x < 0; } template <class _A1, __enable_if_t<is_integral<_A1>::value && !is_signed<_A1>::value, int> = 0> -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool signbit(_A1) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(_A1) _NOEXCEPT { return false; } diff --git a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp index a07260a34516f..3f17f21e8c108 100644 --- a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp +++ b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp @@ -220,9 +220,16 @@ int main(int, char**) { ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0) == 1); ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0L) == 1); +// TODO(LLVM 22): Remove `__has_constexpr_builtin` conditional once support for Clang 19 is dropped. +#if !__has_constexpr_builtin(__builtin_signbit) ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1); ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1); ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1); +#else + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1); + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1); + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1); +#endif ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0f, 0.0f) == 0); ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0, 0.0) == 0); diff --git a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-gcc.pass.cpp b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-gcc.pass.cpp index 8c481f41a945e..d8779706bcee2 100644 --- a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-gcc.pass.cpp +++ b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-gcc.pass.cpp @@ -217,9 +217,9 @@ int main(int, char**) { ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0) == 1); ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0L) == 1); - ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1); - ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1); - ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1); + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1); + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1); + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1); ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0f, 0.0f) == 0); ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0, 0.0) == 0); diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp new file mode 100644 index 0000000000000..c85033e363ce5 --- /dev/null +++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// bool signbit(floating-point-type x); // constexpr since C++23
+
+// We don't control the implementation on windows
+// UNSUPPORTED: windows
+
+// These compilers don't support constexpr `__builtin_signbit` yet.
+// UNSUPPORTED: clang-17, clang-18, clang-19, apple-clang-15, apple-clang-16
+
+#include <cassert>
+#include <cmath>
+#include <limits>
+
+#include "test_macros.h"
+#include "type_algorithms.h"
+
+struct TestFloat {
+  template <class T>
+  static TEST_CONSTEXPR_CXX23 bool test() {
+    assert(!std::signbit(T(0)));
+    assert(!std::signbit(std::numeric_limits<T>::min()));
+    assert(!std::signbit(std::numeric_limits<T>::denorm_min()));
+    assert(!std::signbit(std::numeric_limits<T>::max()));
+    assert(!std::signbit(std::numeric_limits<T>::infinity()));
+    assert(!std::signbit(std::numeric_limits<T>::quiet_NaN()));
+    assert(!std::signbit(std::numeric_limits<T>::signaling_NaN()));
+    assert(std::signbit(-T(0)));
+    assert(std::signbit(-std::numeric_limits<T>::infinity()));
+    assert(std::signbit(std::numeric_limits<T>::lowest()));
+
+    return true;
+  }
+
+  template <class T>
+  TEST_CONSTEXPR_CXX23 void operator()() {
+    test<T>();
+#if TEST_STD_VER >= 23
+    static_assert(test<T>());
+#endif
+  }
+};
+
+struct TestInt {
+  template <class T>
+  static TEST_CONSTEXPR_CXX23 bool test() {
+    assert(!std::signbit(std::numeric_limits<T>::max()));
+    assert(!std::signbit(T(0)));
+    if (std::is_unsigned<T>::value) {
+      assert(!std::signbit(std::numeric_limits<T>::lowest()));
+    } else {
+      assert(std::signbit(std::numeric_limits<T>::lowest()));
+    }
+
+    return true;
+  }
+
+  template <class T>
+  TEST_CONSTEXPR_CXX23 void operator()() {
+    test<T>();
+#if TEST_STD_VER >= 23
+    static_assert(test<T>());
+#endif
+  }
+};
+
+int main(int, char**) {
+  types::for_each(types::floating_point_types(), TestFloat());
+  types::for_each(types::integral_types(), TestInt());
+
+  return 0;
+}

From 7f0c5b0502b462d2afad32d3681b37cfc15ba844 Mon Sep 17 00:00:00 2001
From: Lukacma
Date: Thu, 5 Sep 2024 14:47:10 +0100
Subject: [PATCH 226/425] [AArch64] Fix invalid use of ld1/st1 in stack alloc (#105518)

This patch fixes incorrect usage of the scalar+immediate variant of the
ld1/st1 instructions during stack allocation, introduced by
[c4bac7f](https://github.com/llvm/llvm-project/commit/c4bac7f7dcd931a5e561604e95656a24c3d1c9d9).
That commit used ld1/st1 even when the stack offset was outside the
immediate range of these instructions, producing invalid assembly, and it
also used incorrect offsets when ld1/st1 could be used.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |   27 +-
 llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll  |   68 +-
 .../CodeGen/AArch64/sme2-intrinsics-ld1.ll    | 1456 +++++++++++------
 .../CodeGen/AArch64/sme2-intrinsics-ldnt1.ll  | 1456 +++++++++++------
 .../AArch64/sve-callee-save-restore-pairs.ll  |   84 +-
 5 files changed, 1952 insertions(+), 1139 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ad20e76d0fe2e..7e041b086599b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3020,6 +3020,7 @@ static void computeCalleeSaveRegisterPairs(
       ByteOffset += StackFillDir * StackHazardSize;
     LastReg = RPI.Reg1;
 
+    int Scale = RPI.getScale();
     // Add the next reg to the pair if it is in the same register class.
if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { Register NextReg = CSI[i + RegInc].getReg(); @@ -3045,9 +3046,14 @@ static void computeCalleeSaveRegisterPairs( case RegPairInfo::PPR: break; case RegPairInfo::ZPR: - if (AFI->getPredicateRegForFillSpill() != 0) - if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) + if (AFI->getPredicateRegForFillSpill() != 0 && + ((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) { + // Calculate offset of register pair to see if pair instruction can be + // used. + int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale; + if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0)) RPI.Reg2 = NextReg; + } break; case RegPairInfo::VG: break; @@ -3087,7 +3093,6 @@ static void computeCalleeSaveRegisterPairs( if (NeedsWinCFI && RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair RPI.FrameIdx = CSI[i + RegInc].getFrameIdx(); - int Scale = RPI.getScale(); int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset; assert(OffsetPre % Scale == 0); @@ -3356,8 +3361,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineMemOperand::MOStore, Size, Alignment)); MIB.addReg(PnReg); MIB.addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale], - // where factor*scale is implicit + .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale], + // where 2*vscale is implicit .setMIFlag(MachineInstr::FrameSetup); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), @@ -3378,8 +3383,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale], - // where factor*scale is implicit + .addImm(RPI.Offset) // [sp, #offset*vscale], + // where factor*vscale is implicit .setMIFlag(MachineInstr::FrameSetup); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), @@ -3523,8 +3528,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineMemOperand::MOLoad, Size, Alignment)); MIB.addReg(PnReg); MIB.addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale] - // where factor*scale is implicit + .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale] + // where 2*vscale is implicit .setMIFlag(MachineInstr::FrameDestroy); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), @@ -3541,8 +3546,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( } MIB.addReg(Reg1, getDefRegState(true)); MIB.addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale] - // where factor*scale is implicit + .addImm(RPI.Offset) // [sp, #offset*vscale] + // where factor*vscale is implicit .setMIFlag(MachineInstr::FrameDestroy); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll index a96f9e382ed1a..8724e7c1c368d 100644 --- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll +++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll @@ -332,16 +332,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b 
{ z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -349,7 +349,8 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG @@ -372,15 +373,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG ; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte 
Folded Reload -; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -427,16 +429,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: ptrue pn8.b ; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -444,7 +446,8 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG @@ -465,15 +468,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: .cfi_restore vg ; FP-CHECK-NEXT: addvl sp, sp, #1 ; FP-CHECK-NEXT: ptrue pn8.b +; 
FP-CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll index 29d3d68fc4c3d..c63899cf7d257 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll @@ -55,31 +55,45 @@ define @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte 
Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -477,14 +569,20 @@ define @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -497,15 +595,21 @@ define @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -573,31 +677,45 @@ define @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8_scalar( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload 
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -736,14 +880,20 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -756,15 +906,21 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded 
Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -832,31 +988,45 @@ define @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8_scalar( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded 
Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -995,14 +1191,20 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1015,15 +1217,21 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1093,32 +1301,46 @@ define @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte 
Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1271,13 +1517,18 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1295,14 +1546,19 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload 
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1376,32 +1632,46 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; 
STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1413,13 +1683,18 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1437,14 +1712,19 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul 
vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1518,32 +1798,46 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; 
STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1555,13 +1849,18 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, 
[sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1579,14 +1878,19 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1660,32 +1964,46 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte 
Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1697,13 +2015,18 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1721,14 +2044,19 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; 
CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1802,32 +2130,46 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; 
STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1839,13 +2181,18 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1863,14 +2210,19 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul 
vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1944,32 +2296,46 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; 
STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1981,13 +2347,18 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul 
vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2005,14 +2376,19 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2086,32 +2462,46 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 
16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2123,13 +2513,18 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2147,14 +2542,19 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded 
Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll index 3d3748e101122..05241f788d3ea 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll @@ -8,31 +8,45 @@ define @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded 
Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -124,14 +164,20 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -144,15 +190,21 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded 
Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -173,31 +225,45 @@ define @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8_scalar( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte 
Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -289,14 +381,20 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -309,15 +407,21 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -338,31 +442,45 @@ define @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte 
Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -454,14 +598,20 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -474,15 +624,21 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -503,31 +659,45 @@ define @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte 
Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -619,14 +815,20 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -639,15 +841,21 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded 
Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -668,32 +876,46 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -705,13 +927,18 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -729,14 +956,19 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -760,32 +992,46 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, 
[sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -797,13 +1043,18 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, 
[sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -821,14 +1072,19 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -853,32 +1109,46 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte 
Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -890,13 +1160,18 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -914,14 +1189,19 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; 
CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -945,32 +1225,46 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; 
STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -982,13 +1276,18 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1006,14 +1305,19 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] 
// 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1038,32 +1342,46 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; 
STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1075,13 +1393,18 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul 
vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1099,14 +1422,19 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1130,32 +1458,46 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] 
// 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1167,13 +1509,18 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1191,14 +1538,19 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded 
Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1223,32 +1575,46 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; 
STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1260,13 +1626,18 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1284,14 +1655,19 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, 
mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1315,32 +1691,46 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; 
STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1352,13 +1742,18 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, 
[sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1376,14 +1771,19 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll index 470c0dd45782c..c9d216935edbf 100644 --- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll +++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll @@ -91,16 +91,16 @@ define void @fbyte( %v) { ; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; 
PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -108,7 +108,8 @@ define void @fbyte( %v) { ; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; PAIR-NEXT: .cfi_offset w30, -8 ; PAIR-NEXT: .cfi_offset w29, -16 @@ -122,15 +123,16 @@ define void @fbyte( %v) { ; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; PAIR-NEXT: bl my_func ; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -233,16 +235,16 @@ define void @fhalf( %v) { ; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded 
Spill ; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -250,7 +252,8 @@ define void @fhalf( %v) { ; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; PAIR-NEXT: .cfi_offset w30, -8 ; PAIR-NEXT: .cfi_offset w29, -16 @@ -264,15 +267,16 @@ define void @fhalf( %v) { ; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; PAIR-NEXT: bl my_func ; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte 
Folded Reload -; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -328,7 +332,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() { ; PAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG @@ -338,7 +342,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() { ; PAIR-NEXT: //NO_APP ; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p8, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload @@ -385,7 +389,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() { ; PAIR-NEXT: ptrue pn9.b ; PAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #2, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG @@ -396,7 +400,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() { ; PAIR-NEXT: ptrue pn9.b ; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #2, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: addvl sp, sp, #4 ; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -431,17 +435,17 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_regs() { ; PAIR-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-3 ; PAIR-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: ptrue pn8.b -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ; PAIR-NEXT: //APP ; PAIR-NEXT: //NO_APP -; PAIR-NEXT: ptrue pn8.b -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: addvl sp, sp, #3 ; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; PAIR-NEXT: ret From 80cd2141eb7f6e7be738a01348bc2ccd08b41cd6 Mon Sep 17 00:00:00 2001 From: Mogball Date: Thu, 5 Sep 2024 14:45:38 +0100 Subject: [PATCH 227/425] [mlir][llvm] Add `externally_initialized` support to GlobalOp This maps the `externally_initialized` flag in `llvm::GlobalVariable` to `GlobalOp` and adds exported support. --- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 1 + mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 3 +-- mlir/test/Target/LLVMIR/llvmir.mlir | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 46bf1c9640c17..86f6b3e6326c2 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -1122,6 +1122,7 @@ def LLVM_GlobalOp : LLVM_Op<"mlir.global", Linkage:$linkage, UnitAttr:$dso_local, UnitAttr:$thread_local_, + UnitAttr:$externally_initialized, OptionalAttr:$value, OptionalAttr:$alignment, DefaultValuedAttr, "0">:$addr_space, diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index bb23da039e21f..fcb329eb7a92c 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1021,7 +1021,6 @@ LogicalResult ModuleTranslation::convertGlobals() { } auto linkage = convertLinkageToLLVM(op.getLinkage()); - auto addrSpace = op.getAddrSpace(); // LLVM IR requires constant with linkage other than external or weak // external to have initializers. If MLIR does not provide an initializer, @@ -1037,7 +1036,7 @@ LogicalResult ModuleTranslation::convertGlobals() { /*InsertBefore=*/nullptr, op.getThreadLocal_() ? 
llvm::GlobalValue::GeneralDynamicTLSModel : llvm::GlobalValue::NotThreadLocal, - addrSpace); + op.getAddrSpace(), op.getExternallyInitialized()); if (std::optional comdat = op.getComdat()) { auto selectorOp = cast( diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index df61fef605fde..d2cd0221e0ea7 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -39,6 +39,9 @@ llvm.mlir.global internal constant @string_const("foobar") : !llvm.array<6 x i8> // CHECK: @int_global_undef = internal global i64 undef llvm.mlir.global internal @int_global_undef() : i64 +// CHECK: @externally_initialized_global = internal externally_initialized global i32 0 +llvm.mlir.global internal @externally_initialized_global(0 : i32) {externally_initialized} : i32 + // CHECK: @f8E3M4_global_as_i8 = internal global i8 56 llvm.mlir.global internal @f8E3M4_global_as_i8(1.5 : f8E3M4) : i8 From 0ba78182b975d8ccd8ca42b33fbf038a85a44747 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 5 Sep 2024 14:54:05 +0100 Subject: [PATCH 228/425] [X86] LowerSelect - generalize "select icmp(x,0), lhs, rhs" folding patterns [NFC] (#107374) This patch proposes we add a LowerSELECTWithCmpZero helper, which allows us to fold the compare-with-zero from different condition nodes with minimal duplication. So far I've only handled the simple no-cmov case for or/xor nodes, but the intention is to convert more folds in future PRs. NFC preliminary patch for #107272 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 87 +++++++++++++++---------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 092a7192929fd..5f87ffd2f1eab 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24074,6 +24074,55 @@ static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } +// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns. +static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, + unsigned X86CC, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + EVT CmpVT = CmpVal.getValueType(); + EVT VT = LHS.getValueType(); + if (!CmpVT.isScalarInteger() || !VT.isScalarInteger()) + return SDValue(); + + if (!Subtarget.canUseCMOV() && X86CC == X86::COND_E && + CmpVal.getOpcode() == ISD::AND && isOneConstant(CmpVal.getOperand(1))) { + SDValue Src1, Src2; + // true if RHS is XOR or OR operator and one of its operands + // is equal to LHS + // ( a , a op b) || ( b , a op b) + auto isOrXorPattern = [&]() { + if ((RHS.getOpcode() == ISD::XOR || RHS.getOpcode() == ISD::OR) && + (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS)) { + Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0); + Src2 = LHS; + return true; + } + return false; + }; + + if (isOrXorPattern()) { + SDValue Neg; + unsigned int CmpSz = CmpVT.getSizeInBits(); + // we need mask of all zeros or ones with same size of the other + // operands. 
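+      // Descriptive example of the fold this helper performs (derived from
+      // the code below, not new behavior):
+      //   select ((x & 1) == 0), y, (y | z) --> ((-(x & 1)) & z) | y
+      //   select ((x & 1) == 0), y, (y ^ z) --> ((-(x & 1)) & z) ^ y
+      // -(x & 1) is all-zeros when the low bit of x is clear and all-ones
+      // when it is set, so the AND keeps either 0 or z before the original
+      // OR/XOR is applied to LHS.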
+ if (CmpSz > VT.getSizeInBits()) + Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpVal); + else if (CmpSz < VT.getSizeInBits()) + Neg = DAG.getNode( + ISD::AND, DL, VT, + DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpVal.getOperand(0)), + DAG.getConstant(1, DL, VT)); + else + Neg = CmpVal; + SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1)) + SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z + return DAG.getNode(RHS.getOpcode(), DL, VT, And, Src2); // And Op y + } + } + + return SDValue(); +} + SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool AddTest = true; SDValue Cond = Op.getOperand(0); @@ -24218,41 +24267,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Sub.getValue(1)); return DAG.getNode(ISD::OR, DL, VT, SBB, Y); - } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E && - CmpOp0.getOpcode() == ISD::AND && - isOneConstant(CmpOp0.getOperand(1))) { - SDValue Src1, Src2; - // true if Op2 is XOR or OR operator and one of its operands - // is equal to Op1 - // ( a , a op b) || ( b , a op b) - auto isOrXorPattern = [&]() { - if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && - (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { - Src1 = - Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); - Src2 = Op1; - return true; - } - return false; - }; - - if (isOrXorPattern()) { - SDValue Neg; - unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); - // we need mask of all zeros or ones with same size of the other - // operands. - if (CmpSz > VT.getSizeInBits()) - Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); - else if (CmpSz < VT.getSizeInBits()) - Neg = DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), - DAG.getConstant(1, DL, VT)); - else - Neg = CmpOp0; - SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1)) - SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z - return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y - } + } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode, + DL, DAG, Subtarget)) { + return R; } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) && Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) && ((CondCode == X86::COND_S) || // smin(x, 0) From 233ed51cf53d590d3f52d5becff95317dbf73657 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 5 Sep 2024 15:57:20 +0200 Subject: [PATCH 229/425] [LLD][COFF][NFC] Use is64Bit in Baserel::getDefaultType. (#107378) In preparation for ARM64EC support. Also make it static. --- lld/COFF/Chunks.cpp | 11 +---------- lld/COFF/Chunks.h | 2 +- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 386012e3ce823..060eb6c32004d 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1000,16 +1000,7 @@ void BaserelChunk::writeTo(uint8_t *buf) const { } uint8_t Baserel::getDefaultType(llvm::COFF::MachineTypes machine) { - switch (machine) { - case AMD64: - case ARM64: - return IMAGE_REL_BASED_DIR64; - case I386: - case ARMNT: - return IMAGE_REL_BASED_HIGHLOW; - default: - llvm_unreachable("unknown machine type"); - } + return is64Bit(machine) ? 
IMAGE_REL_BASED_DIR64 : IMAGE_REL_BASED_HIGHLOW; } MergeChunk::MergeChunk(uint32_t alignment) diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index 8ccd05b21af7b..30e5b538c352e 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -713,7 +713,7 @@ class Baserel { Baserel(uint32_t v, uint8_t ty) : rva(v), type(ty) {} explicit Baserel(uint32_t v, llvm::COFF::MachineTypes machine) : Baserel(v, getDefaultType(machine)) {} - uint8_t getDefaultType(llvm::COFF::MachineTypes machine); + static uint8_t getDefaultType(llvm::COFF::MachineTypes machine); uint32_t rva; uint8_t type; From 9707b98e572adf34ef3e71bcf159dae08e654fd8 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 16:02:50 +0200 Subject: [PATCH 230/425] [ConstantRange] Perform increment on APInt (NFC) This handles the edge case where BitWidth is 1 and doing the increment gets a value that's not valid in that width, while we just want wrap-around. Split out of https://github.com/llvm/llvm-project/pull/80309. --- llvm/lib/Analysis/ValueTracking.cpp | 2 +- llvm/lib/IR/ConstantRange.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 8f35fd5eb5268..3a0ec99ee5ea1 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9596,7 +9596,7 @@ static ConstantRange getRangeForIntrinsic(const IntrinsicInst &II) { case Intrinsic::cttz: // Maximum of set/clear bits is the bit width. return ConstantRange::getNonEmpty(APInt::getZero(Width), - APInt(Width, Width + 1)); + APInt(Width, Width) + 1); case Intrinsic::uadd_sat: // uadd.sat(x, C) produces [C, UINT_MAX]. if (match(II.getOperand(0), m_APInt(C)) || diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index c389d7214defc..61a051821a5db 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -1952,7 +1952,7 @@ ConstantRange ConstantRange::ctlz(bool ZeroIsPoison) const { // Zero is either safe or not in the range. The output range is composed by // the result of countLeadingZero of the two extremes. return getNonEmpty(APInt(getBitWidth(), getUnsignedMax().countl_zero()), - APInt(getBitWidth(), getUnsignedMin().countl_zero() + 1)); + APInt(getBitWidth(), getUnsignedMin().countl_zero()) + 1); } static ConstantRange getUnsignedCountTrailingZerosRange(const APInt &Lower, @@ -2011,7 +2011,7 @@ ConstantRange ConstantRange::cttz(bool ZeroIsPoison) const { } if (isFullSet()) - return getNonEmpty(Zero, APInt(BitWidth, BitWidth + 1)); + return getNonEmpty(Zero, APInt(BitWidth, BitWidth) + 1); if (!isWrappedSet()) return getUnsignedCountTrailingZerosRange(Lower, Upper); // The range is wrapped. We decompose it into two ranges, [0, Upper) and @@ -2056,7 +2056,7 @@ ConstantRange ConstantRange::ctpop() const { unsigned BitWidth = getBitWidth(); APInt Zero = APInt::getZero(BitWidth); if (isFullSet()) - return getNonEmpty(Zero, APInt(BitWidth, BitWidth + 1)); + return getNonEmpty(Zero, APInt(BitWidth, BitWidth) + 1); if (!isWrappedSet()) return getUnsignedPopCountRange(Lower, Upper); // The range is wrapped. We decompose it into two ranges, [0, Upper) and From 9e85efb0dec8e78ca69925a05c0bbba211dee507 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 16:04:52 +0200 Subject: [PATCH 231/425] [ConstantRangeTest] Set APInt signed flags where needed (NFC) Split out from https://github.com/llvm/llvm-project/pull/80309 to avoid assertion failures in the future. 
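A minimal sketch (editorial illustration, not part of this patch or the preceding
[ConstantRange] change) of the APInt constructor pattern both NFC changes prepare
for, assuming the stricter value-range assertions discussed in PR #80309; the
function name apintSketch is made up for the example:

    #include "llvm/ADT/APInt.h"
    using llvm::APInt;

    void apintSketch() {
      // A negative value has to be marked as signed explicitly, otherwise a
      // constructor that asserts "value fits in BitWidth" would fire:
      APInt SignedVal(16, -5, /*isSigned=*/true);

      // For "max + 1" style bounds, do the increment on the APInt so that a
      // 1-bit value wraps to 0 instead of handing the unrepresentable value
      // 2 to the constructor, as APInt(Width, Width + 1) would for Width == 1:
      unsigned Width = 1;
      APInt UpperBound = APInt(Width, Width) + 1;
      (void)SignedVal;
      (void)UpperBound;
    }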
--- llvm/unittests/IR/ConstantRangeTest.cpp | 240 ++++++++++++------------ 1 file changed, 124 insertions(+), 116 deletions(-) diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index 4815117458b9a..e1d9b3e387b20 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -372,10 +372,10 @@ TEST_F(ConstantRangeTest, GetMinsAndMaxes) { EXPECT_EQ(Some.getSignedMax(), APInt(16, 0xaa9)); EXPECT_EQ(Wrap.getSignedMax(), APInt(16, INT16_MAX)); - EXPECT_EQ(Full.getSignedMin(), APInt(16, (uint64_t)INT16_MIN)); + EXPECT_EQ(Full.getSignedMin(), APInt(16, (uint16_t)INT16_MIN)); EXPECT_EQ(One.getSignedMin(), APInt(16, 0xa)); EXPECT_EQ(Some.getSignedMin(), APInt(16, 0xa)); - EXPECT_EQ(Wrap.getSignedMin(), APInt(16, (uint64_t)INT16_MIN)); + EXPECT_EQ(Wrap.getSignedMin(), APInt(16, (uint16_t)INT16_MIN)); // Found by Klee EXPECT_EQ(ConstantRange(APInt(4, 7), APInt(4, 0)).getSignedMax(), @@ -487,7 +487,7 @@ TEST_F(ConstantRangeTest, SExt) { APInt(20, INT16_MAX + 1, true))); EXPECT_EQ(ConstantRange(APInt(8, 120), APInt(8, 140)).signExtend(16), - ConstantRange(APInt(16, -128), APInt(16, 128))); + ConstantRange(APInt(16, -128, true), APInt(16, 128))); EXPECT_EQ(ConstantRange(APInt(16, 0x0200), APInt(16, 0x8000)).signExtend(19), ConstantRange(APInt(19, 0x0200), APInt(19, 0x8000))); @@ -516,7 +516,7 @@ TEST_F(ConstantRangeTest, IntersectWith) { EXPECT_TRUE(LHS.intersectWith(RHS) == LHS); // previous bug: intersection of [min, 3) and [2, max) should be 2 - LHS = ConstantRange(APInt(32, -2147483646), APInt(32, 3)); + LHS = ConstantRange(APInt(32, (uint32_t)-2147483646), APInt(32, 3)); RHS = ConstantRange(APInt(32, 2), APInt(32, 2147483646)); EXPECT_EQ(LHS.intersectWith(RHS), ConstantRange(APInt(32, 2))); @@ -744,45 +744,51 @@ TEST_F(ConstantRangeTest, AddWithNoWrap) { EXPECT_NE(Some.addWithNoWrap(Full, OBO::NoSignedWrap), Full); EXPECT_EQ(Full.addWithNoWrap(ConstantRange(APInt(16, 1), APInt(16, 2)), OBO::NoSignedWrap), - ConstantRange(APInt(16, INT16_MIN + 1), APInt(16, INT16_MIN))); + ConstantRange(APInt(16, INT16_MIN + 1, true), + APInt(16, INT16_MIN, true))); EXPECT_EQ(ConstantRange(APInt(16, 1), APInt(16, 2)) .addWithNoWrap(Full, OBO::NoSignedWrap), - ConstantRange(APInt(16, INT16_MIN + 1), APInt(16, INT16_MIN))); - EXPECT_EQ(Full.addWithNoWrap(ConstantRange(APInt(16, -1), APInt(16, 0)), + ConstantRange(APInt(16, INT16_MIN + 1, true), + APInt(16, INT16_MIN, true))); + EXPECT_EQ(Full.addWithNoWrap(ConstantRange(APInt(16, -1, true), APInt(16, 0)), OBO::NoSignedWrap), - ConstantRange(APInt(16, INT16_MIN), APInt(16, INT16_MAX))); + ConstantRange(APInt(16, INT16_MIN, true), APInt(16, INT16_MAX))); EXPECT_EQ(ConstantRange(APInt(8, 100), APInt(8, 120)) .addWithNoWrap(ConstantRange(APInt(8, 120), APInt(8, 123)), OBO::NoSignedWrap), ConstantRange(8, false)); - EXPECT_EQ(ConstantRange(APInt(8, -120), APInt(8, -100)) - .addWithNoWrap(ConstantRange(APInt(8, -110), APInt(8, -100)), - OBO::NoSignedWrap), + EXPECT_EQ(ConstantRange(APInt(8, -120, true), APInt(8, -100, true)) + .addWithNoWrap( + ConstantRange(APInt(8, -110, true), APInt(8, -100, true)), + OBO::NoSignedWrap), ConstantRange(8, false)); - EXPECT_EQ(ConstantRange(APInt(8, 0), APInt(8, 101)) - .addWithNoWrap(ConstantRange(APInt(8, -128), APInt(8, 28)), - OBO::NoSignedWrap), - ConstantRange(8, true)); - EXPECT_EQ(ConstantRange(APInt(8, 0), APInt(8, 101)) - .addWithNoWrap(ConstantRange(APInt(8, -120), APInt(8, 29)), - OBO::NoSignedWrap), - ConstantRange(APInt(8, -120), APInt(8, 
-128))); - EXPECT_EQ(ConstantRange(APInt(8, -50), APInt(8, 50)) + EXPECT_EQ( + ConstantRange(APInt(8, 0), APInt(8, 101)) + .addWithNoWrap(ConstantRange(APInt(8, -128, true), APInt(8, 28)), + OBO::NoSignedWrap), + ConstantRange(8, true)); + EXPECT_EQ( + ConstantRange(APInt(8, 0), APInt(8, 101)) + .addWithNoWrap(ConstantRange(APInt(8, -120, true), APInt(8, 29)), + OBO::NoSignedWrap), + ConstantRange(APInt(8, -120, true), APInt(8, -128, true))); + EXPECT_EQ(ConstantRange(APInt(8, -50, true), APInt(8, 50)) .addWithNoWrap(ConstantRange(APInt(8, 10), APInt(8, 20)), OBO::NoSignedWrap), - ConstantRange(APInt(8, -40), APInt(8, 69))); + ConstantRange(APInt(8, -40, true), APInt(8, 69))); EXPECT_EQ(ConstantRange(APInt(8, 10), APInt(8, 20)) - .addWithNoWrap(ConstantRange(APInt(8, -50), APInt(8, 50)), + .addWithNoWrap(ConstantRange(APInt(8, -50, true), APInt(8, 50)), OBO::NoSignedWrap), - ConstantRange(APInt(8, -40), APInt(8, 69))); - EXPECT_EQ(ConstantRange(APInt(8, 120), APInt(8, -10)) + ConstantRange(APInt(8, -40, true), APInt(8, 69))); + EXPECT_EQ(ConstantRange(APInt(8, 120), APInt(8, -10, true)) .addWithNoWrap(ConstantRange(APInt(8, 5), APInt(8, 20)), OBO::NoSignedWrap), ConstantRange(APInt(8, 125), APInt(8, 9))); - EXPECT_EQ(ConstantRange(APInt(8, 5), APInt(8, 20)) - .addWithNoWrap(ConstantRange(APInt(8, 120), APInt(8, -10)), - OBO::NoSignedWrap), - ConstantRange(APInt(8, 125), APInt(8, 9))); + EXPECT_EQ( + ConstantRange(APInt(8, 5), APInt(8, 20)) + .addWithNoWrap(ConstantRange(APInt(8, 120), APInt(8, -10, true)), + OBO::NoSignedWrap), + ConstantRange(APInt(8, 125), APInt(8, 9))); TestBinaryOpExhaustive( [](const ConstantRange &CR1, const ConstantRange &CR2) { @@ -827,15 +833,15 @@ TEST_F(ConstantRangeTest, AddWithNoWrap) { EXPECT_EQ(ConstantRange(APInt(8, 10), APInt(8, 20)) .addWithNoWrap(ConstantRange(APInt(8, 50), APInt(8, 200)), OBO::NoUnsignedWrap), - ConstantRange(APInt(8, 60), APInt(8, -37))); - EXPECT_EQ(ConstantRange(APInt(8, 20), APInt(8, -30)) + ConstantRange(APInt(8, 60), APInt(8, -37, true))); + EXPECT_EQ(ConstantRange(APInt(8, 20), APInt(8, -30, true)) .addWithNoWrap(ConstantRange(APInt(8, 5), APInt(8, 20)), OBO::NoUnsignedWrap), - ConstantRange(APInt(8, 25), APInt(8, -11))); + ConstantRange(APInt(8, 25), APInt(8, -11, true))); EXPECT_EQ(ConstantRange(APInt(8, 5), APInt(8, 20)) - .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, -30)), + .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, -30, true)), OBO::NoUnsignedWrap), - ConstantRange(APInt(8, 25), APInt(8, -11))); + ConstantRange(APInt(8, 25), APInt(8, -11, true))); TestBinaryOpExhaustive( [](const ConstantRange &CR1, const ConstantRange &CR2) { @@ -853,7 +859,7 @@ TEST_F(ConstantRangeTest, AddWithNoWrap) { EXPECT_EQ(ConstantRange(APInt(8, 50), APInt(8, 100)) .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 70)), OBO::NoSignedWrap), - ConstantRange(APInt(8, 70), APInt(8, -128))); + ConstantRange(APInt(8, 70), APInt(8, -128, true))); EXPECT_EQ(ConstantRange(APInt(8, 50), APInt(8, 100)) .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 70)), OBO::NoUnsignedWrap), @@ -861,17 +867,17 @@ TEST_F(ConstantRangeTest, AddWithNoWrap) { EXPECT_EQ(ConstantRange(APInt(8, 50), APInt(8, 100)) .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 70)), OBO::NoUnsignedWrap | OBO::NoSignedWrap), - ConstantRange(APInt(8, 70), APInt(8, -128))); + ConstantRange(APInt(8, 70), APInt(8, -128, true))); - EXPECT_EQ(ConstantRange(APInt(8, -100), APInt(8, -50)) + EXPECT_EQ(ConstantRange(APInt(8, -100, true), APInt(8, -50, true)) 
.addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 30)), OBO::NoSignedWrap), - ConstantRange(APInt(8, -80), APInt(8, -21))); - EXPECT_EQ(ConstantRange(APInt(8, -100), APInt(8, -50)) + ConstantRange(APInt(8, -80, true), APInt(8, -21, true))); + EXPECT_EQ(ConstantRange(APInt(8, -100, true), APInt(8, -50, true)) .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 30)), OBO::NoUnsignedWrap), ConstantRange(APInt(8, 176), APInt(8, 235))); - EXPECT_EQ(ConstantRange(APInt(8, -100), APInt(8, -50)) + EXPECT_EQ(ConstantRange(APInt(8, -100, true), APInt(8, -50, true)) .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 30)), OBO::NoUnsignedWrap | OBO::NoSignedWrap), ConstantRange(APInt(8, 176), APInt(8, 235))); @@ -1004,17 +1010,17 @@ TEST_F(ConstantRangeTest, Multiply) { ConstantRange(APInt(8, 250), APInt(8, 253))); // TODO: This should be return [-2, 0] - EXPECT_EQ(ConstantRange(APInt(8, -2)).multiply( - ConstantRange(APInt(8, 0), APInt(8, 2))), - ConstantRange(APInt(8, -2), APInt(8, 1))); + EXPECT_EQ(ConstantRange(APInt(8, -2, true)) + .multiply(ConstantRange(APInt(8, 0), APInt(8, 2))), + ConstantRange(APInt(8, -2, true), APInt(8, 1))); // Multiplication by -1 should give precise results. - EXPECT_EQ(ConstantRange(APInt(8, 3), APInt(8, -11)) - .multiply(ConstantRange(APInt(8, -1))), - ConstantRange(APInt(8, 12), APInt(8, -2))); - EXPECT_EQ(ConstantRange(APInt(8, -1)) - .multiply(ConstantRange(APInt(8, 3), APInt(8, -11))), - ConstantRange(APInt(8, 12), APInt(8, -2))); + EXPECT_EQ(ConstantRange(APInt(8, 3), APInt(8, -11, true)) + .multiply(ConstantRange(APInt(8, -1, true))), + ConstantRange(APInt(8, 12), APInt(8, -2, true))); + EXPECT_EQ(ConstantRange(APInt(8, -1, true)) + .multiply(ConstantRange(APInt(8, 3), APInt(8, -11, true))), + ConstantRange(APInt(8, 12), APInt(8, -2, true))); TestBinaryOpExhaustive( [](const ConstantRange &CR1, const ConstantRange &CR2) { @@ -1185,11 +1191,11 @@ TEST_F(ConstantRangeTest, SMax) { EXPECT_EQ(Empty.smax(Wrap), Empty); EXPECT_EQ(Empty.smax(One), Empty); EXPECT_EQ(Some.smax(Some), Some); - EXPECT_EQ(Some.smax(Wrap), ConstantRange(APInt(16, 0xa), - APInt(16, (uint64_t)INT16_MIN))); + EXPECT_EQ(Some.smax(Wrap), + ConstantRange(APInt(16, 0xa), APInt(16, (uint16_t)INT16_MIN))); EXPECT_EQ(Some.smax(One), Some); - EXPECT_EQ(Wrap.smax(One), ConstantRange(APInt(16, 0xa), - APInt(16, (uint64_t)INT16_MIN))); + EXPECT_EQ(Wrap.smax(One), + ConstantRange(APInt(16, 0xa), APInt(16, (uint16_t)INT16_MIN))); EXPECT_EQ(One.smax(One), One); TestBinaryOpExhaustive( @@ -1231,20 +1237,20 @@ TEST_F(ConstantRangeTest, UMin) { TEST_F(ConstantRangeTest, SMin) { EXPECT_EQ(Full.smin(Full), Full); EXPECT_EQ(Full.smin(Empty), Empty); - EXPECT_EQ(Full.smin(Some), ConstantRange(APInt(16, (uint64_t)INT16_MIN), - APInt(16, 0xaaa))); + EXPECT_EQ(Full.smin(Some), + ConstantRange(APInt(16, (uint16_t)INT16_MIN), APInt(16, 0xaaa))); EXPECT_EQ(Full.smin(Wrap), Full); EXPECT_EQ(Empty.smin(Empty), Empty); EXPECT_EQ(Empty.smin(Some), Empty); EXPECT_EQ(Empty.smin(Wrap), Empty); EXPECT_EQ(Empty.smin(One), Empty); EXPECT_EQ(Some.smin(Some), Some); - EXPECT_EQ(Some.smin(Wrap), ConstantRange(APInt(16, (uint64_t)INT16_MIN), - APInt(16, 0xaaa))); + EXPECT_EQ(Some.smin(Wrap), + ConstantRange(APInt(16, (uint16_t)INT16_MIN), APInt(16, 0xaaa))); EXPECT_EQ(Some.smin(One), One); EXPECT_EQ(Wrap.smin(Wrap), Wrap); - EXPECT_EQ(Wrap.smin(One), ConstantRange(APInt(16, (uint64_t)INT16_MIN), - APInt(16, 0xb))); + EXPECT_EQ(Wrap.smin(One), + ConstantRange(APInt(16, (uint16_t)INT16_MIN), APInt(16, 0xb))); EXPECT_EQ(One.smin(One), 
One); TestBinaryOpExhaustive( @@ -1320,8 +1326,8 @@ TEST_F(ConstantRangeTest, SDiv) { } // If there is a non-full signed envelope, that should be the result. - APInt SMin(Bits, Results.find_first() - Bias); - APInt SMax(Bits, Results.find_last() - Bias); + APInt SMin(Bits, Results.find_first() - Bias, true); + APInt SMax(Bits, Results.find_last() - Bias, true); ConstantRange Envelope = ConstantRange::getNonEmpty(SMin, SMax + 1); if (!Envelope.isFullSet()) { EXPECT_EQ(Envelope, CR); @@ -1340,8 +1346,8 @@ TEST_F(ConstantRangeTest, SDiv) { --LastPos; } - APInt WMax(Bits, LastNeg); - APInt WMin(Bits, LastPos); + APInt WMax(Bits, LastNeg, true); + APInt WMin(Bits, LastPos, true); ConstantRange Wrapped = ConstantRange::getNonEmpty(WMin, WMax + 1); EXPECT_EQ(Wrapped, CR); }); @@ -1394,8 +1400,8 @@ TEST_F(ConstantRangeTest, SRem) { EXPECT_EQ(Full.srem(Full), ConstantRange(APInt::getSignedMinValue(16) + 1, APInt::getSignedMinValue(16))); - ConstantRange PosMod(APInt(16, 10), APInt(16, 21)); // [10, 20] - ConstantRange NegMod(APInt(16, -20), APInt(16, -9)); // [-20, -10] + ConstantRange PosMod(APInt(16, 10), APInt(16, 21)); // [10, 20] + ConstantRange NegMod(APInt(16, -20, true), APInt(16, -9, true)); // [-20, -10] ConstantRange IntMinMod(APInt::getSignedMinValue(16)); ConstantRange Expected(16, true); @@ -1405,12 +1411,12 @@ TEST_F(ConstantRangeTest, SRem) { Expected = ConstantRange(APInt(16, 0), APInt(16, 20)); EXPECT_EQ(PosLargeLHS.srem(PosMod), Expected); EXPECT_EQ(PosLargeLHS.srem(NegMod), Expected); - ConstantRange NegLargeLHS(APInt(16, -40), APInt(16, 1)); - Expected = ConstantRange(APInt(16, -19), APInt(16, 1)); + ConstantRange NegLargeLHS(APInt(16, -40, true), APInt(16, 1)); + Expected = ConstantRange(APInt(16, -19, true), APInt(16, 1)); EXPECT_EQ(NegLargeLHS.srem(PosMod), Expected); EXPECT_EQ(NegLargeLHS.srem(NegMod), Expected); - ConstantRange PosNegLargeLHS(APInt(16, -32), APInt(16, 38)); - Expected = ConstantRange(APInt(16, -19), APInt(16, 20)); + ConstantRange PosNegLargeLHS(APInt(16, -32, true), APInt(16, 38)); + Expected = ConstantRange(APInt(16, -19, true), APInt(16, 20)); EXPECT_EQ(PosNegLargeLHS.srem(PosMod), Expected); EXPECT_EQ(PosNegLargeLHS.srem(NegMod), Expected); @@ -1419,11 +1425,11 @@ TEST_F(ConstantRangeTest, SRem) { EXPECT_EQ(PosLHS.srem(PosMod), PosLHS); EXPECT_EQ(PosLHS.srem(NegMod), PosLHS); EXPECT_EQ(PosLHS.srem(IntMinMod), PosLHS); - ConstantRange NegLHS(APInt(16, -15), APInt(16, 1)); + ConstantRange NegLHS(APInt(16, -15, true), APInt(16, 1)); EXPECT_EQ(NegLHS.srem(PosMod), NegLHS); EXPECT_EQ(NegLHS.srem(NegMod), NegLHS); EXPECT_EQ(NegLHS.srem(IntMinMod), NegLHS); - ConstantRange PosNegLHS(APInt(16, -12), APInt(16, 18)); + ConstantRange PosNegLHS(APInt(16, -12, true), APInt(16, 18)); EXPECT_EQ(PosNegLHS.srem(PosMod), PosNegLHS); EXPECT_EQ(PosNegLHS.srem(NegMod), PosNegLHS); EXPECT_EQ(PosNegLHS.srem(IntMinMod), PosNegLHS); @@ -1433,11 +1439,11 @@ TEST_F(ConstantRangeTest, SRem) { EXPECT_EQ(PosSmallLHS.srem(PosMod), PosSmallLHS); EXPECT_EQ(PosSmallLHS.srem(NegMod), PosSmallLHS); EXPECT_EQ(PosSmallLHS.srem(IntMinMod), PosSmallLHS); - ConstantRange NegSmallLHS(APInt(16, -7), APInt(16, -2)); + ConstantRange NegSmallLHS(APInt(16, -7, true), APInt(16, -2, true)); EXPECT_EQ(NegSmallLHS.srem(PosMod), NegSmallLHS); EXPECT_EQ(NegSmallLHS.srem(NegMod), NegSmallLHS); EXPECT_EQ(NegSmallLHS.srem(IntMinMod), NegSmallLHS); - ConstantRange PosNegSmallLHS(APInt(16, -3), APInt(16, 8)); + ConstantRange PosNegSmallLHS(APInt(16, -3, true), APInt(16, 8)); 
EXPECT_EQ(PosNegSmallLHS.srem(PosMod), PosNegSmallLHS); EXPECT_EQ(PosNegSmallLHS.srem(NegMod), PosNegSmallLHS); EXPECT_EQ(PosNegSmallLHS.srem(IntMinMod), PosNegSmallLHS); @@ -1554,27 +1560,27 @@ TEST_F(ConstantRangeTest, ShlWithNoWrap) { EXPECT_EQ(One.shlWithNoWrap(Full, OBO::NoSignedWrap), ConstantRange(APInt(16, 10), APInt(16, 20481))); EXPECT_EQ(One.shlWithNoWrap(Full, OBO::NoUnsignedWrap), - ConstantRange(APInt(16, 10), APInt(16, -24575))); + ConstantRange(APInt(16, 10), APInt(16, -24575, true))); EXPECT_EQ(One.shlWithNoWrap(Full, OBO::NoSignedWrap | OBO::NoUnsignedWrap), ConstantRange(APInt(16, 10), APInt(16, 20481))); ConstantRange NegOne(APInt(16, 0xffff)); EXPECT_EQ(NegOne.shlWithNoWrap(Full, OBO::NoSignedWrap), - ConstantRange(APInt(16, -32768), APInt(16, 0))); + ConstantRange(APInt(16, -32768, true), APInt(16, 0))); EXPECT_EQ(NegOne.shlWithNoWrap(Full, OBO::NoUnsignedWrap), NegOne); EXPECT_EQ(ConstantRange(APInt(16, 768)) .shlWithNoWrap(Full, OBO::NoSignedWrap | OBO::NoUnsignedWrap), ConstantRange(APInt(16, 768), APInt(16, 24577))); EXPECT_EQ(Full.shlWithNoWrap(ConstantRange(APInt(16, 1), APInt(16, 16)), OBO::NoUnsignedWrap), - ConstantRange(APInt(16, 0), APInt(16, -1))); - EXPECT_EQ(ConstantRange(APInt(4, 3), APInt(4, -8)) + ConstantRange(APInt(16, 0), APInt(16, -1, true))); + EXPECT_EQ(ConstantRange(APInt(4, 3), APInt(4, -8, true)) .shlWithNoWrap(ConstantRange(APInt(4, 0), APInt(4, 4)), OBO::NoSignedWrap), - ConstantRange(APInt(4, 3), APInt(4, -8))); - EXPECT_EQ(ConstantRange(APInt(4, -1), APInt(4, 0)) + ConstantRange(APInt(4, 3), APInt(4, -8, true))); + EXPECT_EQ(ConstantRange(APInt(4, -1, true), APInt(4, 0)) .shlWithNoWrap(ConstantRange(APInt(4, 1), APInt(4, 4)), OBO::NoSignedWrap), - ConstantRange(APInt(4, -8), APInt(4, -1))); + ConstantRange(APInt(4, -8, true), APInt(4, -1, true))); } TEST_F(ConstantRangeTest, Lshr) { @@ -1620,9 +1626,9 @@ TEST_F(ConstantRangeTest, Ashr) { APInt(16, (0xaaa >> 0xa) + 1))); EXPECT_EQ(Some.ashr(Wrap), ConstantRange(APInt(16, 0), APInt(16, 0xaaa))); EXPECT_EQ(Wrap.ashr(Wrap), Full); - ConstantRange Neg(APInt(16, 0xf3f0, true), APInt(16, 0xf7f8, true)); - EXPECT_EQ(Neg.ashr(Small), ConstantRange(APInt(16, 0xfffc, true), - APInt(16, 0xfffe, true))); + ConstantRange Neg(APInt(16, 0xf3f0), APInt(16, 0xf7f8)); + EXPECT_EQ(Neg.ashr(Small), + ConstantRange(APInt(16, 0xfffc), APInt(16, 0xfffe))); } TEST(ConstantRange, MakeAllowedICmpRegion) { @@ -1665,23 +1671,23 @@ TEST(ConstantRange, MakeSatisfyingICmpRegion) { UnsignedSample), ConstantRange(APInt(8, 199), APInt(8, 0))); - ConstantRange SignedSample(APInt(8, -5), APInt(8, 5)); + ConstantRange SignedSample(APInt(8, -5, true), APInt(8, 5)); EXPECT_EQ( ConstantRange::makeSatisfyingICmpRegion(ICmpInst::ICMP_SLT, SignedSample), - ConstantRange(APInt(8, -128), APInt(8, -5))); + ConstantRange(APInt(8, -128, true), APInt(8, -5, true))); EXPECT_EQ( ConstantRange::makeSatisfyingICmpRegion(ICmpInst::ICMP_SLE, SignedSample), - ConstantRange(APInt(8, -128), APInt(8, -4))); + ConstantRange(APInt(8, -128, true), APInt(8, -4, true))); EXPECT_EQ( ConstantRange::makeSatisfyingICmpRegion(ICmpInst::ICMP_SGT, SignedSample), - ConstantRange(APInt(8, 5), APInt(8, -128))); + ConstantRange(APInt(8, 5), APInt(8, -128, true))); EXPECT_EQ( ConstantRange::makeSatisfyingICmpRegion(ICmpInst::ICMP_SGE, SignedSample), - ConstantRange(APInt(8, 4), APInt(8, -128))); + ConstantRange(APInt(8, 4), APInt(8, -128, true))); } void ICmpTestImpl(CmpInst::Predicate Pred) { @@ -1703,7 +1709,7 @@ TEST(ConstantRange, ICmp) { } TEST(ConstantRange, 
MakeGuaranteedNoWrapRegion) { - const int IntMin4Bits = 8; + const int IntMin4Bits = -8; const int IntMax4Bits = 7; typedef OverflowingBinaryOperator OBO; @@ -1812,7 +1818,7 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) { Instruction::Sub, OneToFive, OBO::NoUnsignedWrap), ConstantRange(APInt::getMinValue(32) + 5, APInt::getMinValue(32))); - ConstantRange MinusFiveToMinusTwo(APInt(32, -5), APInt(32, -1)); + ConstantRange MinusFiveToMinusTwo(APInt(32, -5, true), APInt(32, -1, true)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Add, MinusFiveToMinusTwo, OBO::NoSignedWrap), ConstantRange(APInt::getSignedMinValue(32) + 5, @@ -1826,10 +1832,9 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) { APInt::getSignedMaxValue(32) - 4)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Sub, MinusFiveToMinusTwo, OBO::NoUnsignedWrap), - ConstantRange(APInt::getMaxValue(32) - 1, - APInt::getMinValue(32))); + ConstantRange(APInt::getMaxValue(32) - 1, APInt::getMinValue(32))); - ConstantRange MinusOneToOne(APInt(32, -1), APInt(32, 2)); + ConstantRange MinusOneToOne(APInt(32, -1, true), APInt(32, 2)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Add, MinusOneToOne, OBO::NoSignedWrap), ConstantRange(APInt::getSignedMinValue(32) + 1, @@ -1877,7 +1882,7 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) { ConstantRange(APInt(32, 0), APInt(32, 1) + 1)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Shl, UpToBitWidth, OBO::NoSignedWrap), - ConstantRange(APInt(32, -1), APInt(32, 0) + 1)); + ConstantRange(APInt(32, -1, true), APInt(32, 0) + 1)); EXPECT_EQ( ConstantRange::makeGuaranteedNoWrapRegion( @@ -1898,34 +1903,36 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) { Instruction::Shl, IllegalShAmt, OBO::NoSignedWrap), ConstantRange::getFull(32)); - EXPECT_EQ( - ConstantRange::makeGuaranteedNoWrapRegion( - Instruction::Shl, ConstantRange(APInt(32, -32), APInt(32, 16) + 1), - OBO::NoUnsignedWrap), - ConstantRange::makeGuaranteedNoWrapRegion( - Instruction::Shl, ConstantRange(APInt(32, 0), APInt(32, 16) + 1), - OBO::NoUnsignedWrap)); - EXPECT_EQ( - ConstantRange::makeGuaranteedNoWrapRegion( - Instruction::Shl, ConstantRange(APInt(32, -32), APInt(32, 16) + 1), - OBO::NoSignedWrap), - ConstantRange::makeGuaranteedNoWrapRegion( - Instruction::Shl, ConstantRange(APInt(32, 0), APInt(32, 16) + 1), - OBO::NoSignedWrap)); + EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( + Instruction::Shl, + ConstantRange(APInt(32, -32, true), APInt(32, 16) + 1), + OBO::NoUnsignedWrap), + ConstantRange::makeGuaranteedNoWrapRegion( + Instruction::Shl, + ConstantRange(APInt(32, 0), APInt(32, 16) + 1), + OBO::NoUnsignedWrap)); + EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( + Instruction::Shl, + ConstantRange(APInt(32, -32, true), APInt(32, 16) + 1), + OBO::NoSignedWrap), + ConstantRange::makeGuaranteedNoWrapRegion( + Instruction::Shl, + ConstantRange(APInt(32, 0), APInt(32, 16) + 1), + OBO::NoSignedWrap)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Shl, - ConstantRange(APInt(32, -32), APInt(32, 16) + 1), + ConstantRange(APInt(32, -32, true), APInt(32, 16) + 1), OBO::NoUnsignedWrap), ConstantRange(APInt(32, 0), APInt(32, 65535) + 1)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Shl, - ConstantRange(APInt(32, -32), APInt(32, 16) + 1), + ConstantRange(APInt(32, -32, true), APInt(32, 16) + 1), OBO::NoSignedWrap), - ConstantRange(APInt(32, -32768), APInt(32, 32767) + 1)); + ConstantRange(APInt(32, -32768, 
true), APInt(32, 32767) + 1)); } -template +template void TestNoWrapRegionExhaustive(Instruction::BinaryOps BinOp, unsigned NoWrapKind, Fn OverflowFn) { for (unsigned Bits : {1, 5}) { @@ -2090,14 +2097,15 @@ TEST(ConstantRange, GetEquivalentICmp) { EXPECT_EQ(Pred, CmpInst::ICMP_NE); EXPECT_EQ(RHS, APInt(32, 0)); - EXPECT_TRUE(ConstantRange(APInt(32, -1)).getEquivalentICmp(Pred, RHS)); + EXPECT_TRUE(ConstantRange(APInt(32, -1, true)).getEquivalentICmp(Pred, RHS)); EXPECT_EQ(Pred, CmpInst::ICMP_EQ); - EXPECT_EQ(RHS, APInt(32, -1)); + EXPECT_EQ(RHS, APInt(32, -1, true)); - EXPECT_TRUE( - ConstantRange(APInt(32, -1)).inverse().getEquivalentICmp(Pred, RHS)); + EXPECT_TRUE(ConstantRange(APInt(32, -1, true)) + .inverse() + .getEquivalentICmp(Pred, RHS)); EXPECT_EQ(Pred, CmpInst::ICMP_NE); - EXPECT_EQ(RHS, APInt(32, -1)); + EXPECT_EQ(RHS, APInt(32, -1, true)); EnumerateInterestingConstantRanges([](const ConstantRange &CR) { unsigned Bits = CR.getBitWidth(); From 5024dff6eee5a95a741b063c953422c5b6d02fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 5 Sep 2024 17:25:41 +0300 Subject: [PATCH 232/425] [libc++][ci] Add a test configuration with an incomplete sysroot (#107089) When bringing up a new cross compiler from scratch, we build libunwind/libcxx in a setup where the toolchain is incomplete and unable to perform the normal linker checks; this requires a few special cases in the CMake files. We simulate that scenario by removing the libc++ headers, libunwind and libc++ libraries from the installed toolchain. We need to set CMAKE_CXX_COMPILER_WORKS since CMake fails to probe the compiler. We need to set CMAKE_CXX_COMPILER_TARGET, since LLVM's heuristics fail when CMake hasn't been able to probe the environment properly. (This is normal; one has to set those options when setting up such a toolchain from scratch.) This adds CI coverage for these build scenarios, which otherwise seldom are tested by some build flow (but are essential when setting up a cross compiler from scratch). 
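For reference, the bring-up configuration described above reduces to a handful of CMake cache variables. The sketch below is illustrative only (the source/build paths, generator, and runtimes list are assumptions, not part of this patch); the authoritative form is the mingw-incomplete-sysroot case added to libcxx/utils/ci/run-buildbot below.

    # Hypothetical manual invocation for an incomplete sysroot; adjust the
    # triple, runtimes list, and cache file for the toolchain being brought up.
    cmake -G Ninja -S runtimes -B build \
      -DLLVM_ENABLE_RUNTIMES="libunwind;libcxxabi;libcxx" \
      -DCMAKE_C_COMPILER_WORKS=TRUE \
      -DCMAKE_CXX_COMPILER_WORKS=TRUE \
      -DCMAKE_C_COMPILER_TARGET=x86_64-w64-windows-gnu \
      -DCMAKE_CXX_COMPILER_TARGET=x86_64-w64-windows-gnu \
      -C libcxx/cmake/caches/MinGW.cmake
    # Building is the interesting part; running the tests would add little
    # over the existing mingw-dll configuration.
    ninja -C build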
--- .github/workflows/libcxx-build-and-test.yaml | 7 ++++++ libcxx/utils/ci/run-buildbot | 23 ++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 1a26a699db8e0..b5e60781e0006 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -242,6 +242,7 @@ jobs: - { config: mingw-dll, mingw: true } - { config: mingw-static, mingw: true } - { config: mingw-dll-i686, mingw: true } + - { config: mingw-incomplete-sysroot, mingw: true } steps: - uses: actions/checkout@v4 - name: Install dependencies @@ -260,6 +261,12 @@ jobs: del llvm-mingw*.zip mv llvm-mingw* c:\llvm-mingw echo "c:\llvm-mingw\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append + - name: Simulate a from-scratch build of llvm-mingw + if: ${{ matrix.config == 'mingw-incomplete-sysroot' }} + run: | + rm -r c:\llvm-mingw\include\c++ + rm -r c:\llvm-mingw\*-w64-mingw32\lib\libc++* + rm -r c:\llvm-mingw\*-w64-mingw32\lib\libunwind* - name: Add Git Bash to the path run: | echo "c:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 14ff611302981..b0533cb9a49c9 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -715,6 +715,29 @@ mingw-dll-i686) -C "${MONOREPO_ROOT}/libcxx/cmake/caches/MinGW.cmake" check-runtimes ;; +mingw-incomplete-sysroot) + # When bringing up a new cross compiler from scratch, we build + # libunwind/libcxx in a setup where the toolchain is incomplete and + # unable to perform the normal linker checks; this requires a few + # special cases in the CMake files. + # + # Building in an incomplete setup requires setting CMAKE_*_COMPILER_WORKS, + # as CMake fails to probe the compiler. This case also requires + # setting CMAKE_CXX_COMPILER_TARGET, as LLVM's heuristics for setting + # the triple fails when CMake hasn't been able to probe the environment. + # (This is what one has to do when building the initial libunwind/libcxx + # for a new toolchain.) + clean + generate-cmake \ + -DCMAKE_C_COMPILER_WORKS=TRUE \ + -DCMAKE_CXX_COMPILER_WORKS=TRUE \ + -DCMAKE_C_COMPILER_TARGET=x86_64-w64-windows-gnu \ + -DCMAKE_CXX_COMPILER_TARGET=x86_64-w64-windows-gnu \ + -C "${MONOREPO_ROOT}/libcxx/cmake/caches/MinGW.cmake" + # Only test that building succeeds; there's not much extra value in running + # the tests here, as it would be equivalent to the mingw-dll config above. + ${NINJA} -vC "${BUILD_DIR}" +;; aix) clean generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/AIX.cmake" \ From 16dc65bdc0f0a23bc2696afce2abecd9f2faa097 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Thu, 5 Sep 2024 07:33:55 -0700 Subject: [PATCH 233/425] [mlgo] Fix test post - #106744 Trivial fix, some instruction opcodes changed. --- llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll index 79be2447abcff..31e120007cb7d 100644 --- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll +++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll @@ -26,7 +26,7 @@ ; Also, the first eviction problem is significantly less than 300 instructions. Check ; that there is a zero value. 
; Note: we're regex-ing some of the opcodes to avoid test flakyness. -; CHECK: instructions: 19,{{([0-9]{4})}},1{{([0-9]{3})}},1{{([0-9]{3})}},{{.*}},0, +; CHECK: instructions: 20,{{([0-9]{4})}},1{{([0-9]{3})}},1{{([0-9]{3})}},{{.*}},0, ; Only the candidate virtreg and the 10th LR are included in this problem. Make ; sure the other LRs have values of zero. There are 2700 0s followed by some 1s. ; There's a limit to how many repetitions can be matched. From bded3b3ea9f78c5b3edc3d4a6076665af0ea746b Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Thu, 5 Sep 2024 07:42:23 -0700 Subject: [PATCH 234/425] [llvm][AArch64] Improve the cost model for i128 div's (#107306) --- .../AArch64/AArch64TargetTransformInfo.cpp | 8 +++- llvm/test/Analysis/CostModel/AArch64/div.ll | 43 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 58c267f1ce4bd..2b5deaff39719 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3224,8 +3224,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( } [[fallthrough]]; case ISD::UDIV: { + auto VT = TLI->getValueType(DL, Ty); if (Op2Info.isConstant() && Op2Info.isUniform()) { - auto VT = TLI->getValueType(DL, Ty); if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { // Vector signed division by constant are expanded to the // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division @@ -3240,6 +3240,12 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( } } + // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are + // emitted by the backend even when those functions are not declared in the + // module. 
+ if (!VT.isVector() && VT.getSizeInBits() > 64) + return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind); + InstructionCost Cost = BaseT::getArithmeticInstrCost( Opcode, Ty, CostKind, Op1Info, Op2Info); if (Ty->isVectorTy()) { diff --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll index 7b52e2dc18aa5..2ceaf0c6f536a 100644 --- a/llvm/test/Analysis/CostModel/AArch64/div.ll +++ b/llvm/test/Analysis/CostModel/AArch64/div.ll @@ -5,6 +5,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define i32 @sdiv() { ; CHECK-LABEL: 'sdiv' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, undef @@ -23,6 +24,8 @@ define i32 @sdiv() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, undef + %I64 = sdiv i64 undef, undef %V2i64 = sdiv <2 x i64> undef, undef %V4i64 = sdiv <4 x i64> undef, undef @@ -48,6 +51,7 @@ define i32 @sdiv() { define i32 @udiv() { ; CHECK-LABEL: 'udiv' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, undef @@ -66,6 +70,8 @@ define i32 @udiv() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, undef + %I64 = udiv i64 undef, undef %V2i64 = udiv <2 x i64> undef, undef %V4i64 = udiv <4 x i64> undef, undef @@ -91,6 +97,7 @@ define i32 @udiv() { define i32 @sdiv_const() { ; CHECK-LABEL: 'sdiv_const' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -109,6 +116,8 @@ define i32 @sdiv_const() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, 7 + %I64 = sdiv i64 undef, 7 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -134,6 +143,7 @@ define i32 @sdiv_const() { define i32 @udiv_const() { ; CHECK-LABEL: 'udiv_const' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 
x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -152,6 +162,9 @@ define i32 @udiv_const() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + + %I128 = udiv i128 undef, 7 + %I64 = udiv i64 undef, 7 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, @@ -177,6 +190,7 @@ define i32 @udiv_const() { define i32 @sdiv_uniformconst() { ; CHECK-LABEL: 'sdiv_uniformconst' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -195,6 +209,8 @@ define i32 @sdiv_uniformconst() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, 7 + %I64 = sdiv i64 undef, 7 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -220,6 +236,7 @@ define i32 @sdiv_uniformconst() { define i32 @udiv_uniformconst() { ; CHECK-LABEL: 'udiv_uniformconst' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2i64 = udiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -238,6 +255,8 @@ define i32 @udiv_uniformconst() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, 7 + %I64 = udiv i64 undef, 7 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, @@ -263,6 +282,7 @@ define i32 @udiv_uniformconst() { define i32 @sdiv_constpow2() { ; CHECK-LABEL: 'sdiv_constpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -281,6 +301,8 @@ define i32 @sdiv_constpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, 16 + %I64 = sdiv i64 undef, 16 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -306,6 +328,7 @@ define i32 @sdiv_constpow2() { define i32 @udiv_constpow2() { ; CHECK-LABEL: 'udiv_constpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x 
i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -324,6 +347,8 @@ define i32 @udiv_constpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, 16 + %I64 = udiv i64 undef, 16 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, @@ -349,6 +374,7 @@ define i32 @udiv_constpow2() { define i32 @sdiv_uniformconstpow2() { ; CHECK-LABEL: 'sdiv_uniformconstpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -367,6 +393,8 @@ define i32 @sdiv_uniformconstpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, 16 + %I64 = sdiv i64 undef, 16 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -392,6 +420,7 @@ define i32 @sdiv_uniformconstpow2() { define i32 @udiv_uniformconstpow2() { ; CHECK-LABEL: 'udiv_uniformconstpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2i64 = udiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -410,6 +439,8 @@ define i32 @udiv_uniformconstpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, 16 + %I64 = udiv i64 undef, 16 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, @@ -435,6 +466,7 @@ define i32 @udiv_uniformconstpow2() { define i32 @sdiv_constnegpow2() { ; CHECK-LABEL: 'sdiv_constnegpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -453,6 +485,8 @@ define i32 @sdiv_constnegpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, -16 + %I64 = sdiv i64 undef, -16 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -478,6 +512,7 @@ define i32 @sdiv_constnegpow2() { define i32 @udiv_constnegpow2() { ; CHECK-LABEL: 'udiv_constnegpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, -16 ; CHECK-NEXT: Cost 
Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -496,6 +531,8 @@ define i32 @udiv_constnegpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, -16 + %I64 = udiv i64 undef, -16 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, @@ -521,6 +558,7 @@ define i32 @udiv_constnegpow2() { define i32 @sdiv_uniformconstnegpow2() { ; CHECK-LABEL: 'sdiv_uniformconstnegpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -539,6 +577,8 @@ define i32 @sdiv_uniformconstnegpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, -16 + %I64 = sdiv i64 undef, -16 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -564,6 +604,7 @@ define i32 @sdiv_uniformconstnegpow2() { define i32 @udiv_uniformconstnegpow2() { ; CHECK-LABEL: 'udiv_uniformconstnegpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2i64 = udiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -582,6 +623,8 @@ define i32 @udiv_uniformconstnegpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, -16 + %I64 = udiv i64 undef, -16 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, From 63e8a1b16f344eaef17c4015497326479e69d1e7 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 5 Sep 2024 07:52:27 -0700 Subject: [PATCH 235/425] [SLP] Enable reordering for non-power-of-two vectors (#106638) This change tries to enable vector reordering during vectorization for non-power-of-two vectors. Specifically, my goal is to be able to vectorize reductions whose operands appear in other than identity order. (i.e. a[1] + a[0] + a[2]). Our standard pass pipeline, Reassociation effectively canonicalizes towards this form. So for reduction vectorization to be wildly applicable, we need this feature. This change enables the use of a non-empty ReorderIndices structure - which is effectively required for out of order loads or gathers - while leaving the ReuseShuffleIndices mechanism unused and disabled. If I've understood the code structure, the former is used when describing implicit shuffles required by the vectorization strategy (i.e. loading elements 0,1,3,2 in the order 0,1,2,3 and then shuffling later), while the later is used when trying to optimize explode/buildvectors (called gathers in this code). 
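For concreteness, a small illustrative-only C example of the input shape this targets; it mirrors the dot_product_i32_reorder test updated below, where the scalar adds are consumed in non-identity order after Reassociation:

    // Illustrative only: three products summed in non-identity order, the
    // form Reassociation tends to produce and which previously blocked SLP.
    int dot3(const int *a, const int *b) {
      int m0 = a[0] * b[0];
      int m1 = a[1] * b[1];
      int m2 = a[2] * b[2];
      return (m1 + m0) + m2; // operands appear as 1, 0, 2 rather than 0, 1, 2
    }
    // With this change, SLP can emit a <3 x i32> load of each operand, a
    // vector multiply, and an @llvm.vector.reduce.add, instead of giving up
    // because the scalars are not in identity order.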
I audited all the code enabled by this change, but can't claim to deeply understand most of it. I added a couple of bailouts in places which appeared to be difficult to audit and optional optimizations. I've tried to do so in the least risky way I can, but am not completely confident in this change. Careful review appreciated. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 48 +++++----- .../AArch64/vec3-reorder-reshuffle.ll | 15 ++-- .../SLPVectorizer/RISCV/vec3-base.ll | 88 +++++++++++-------- .../X86/vec3-reorder-reshuffle.ll | 9 +- 4 files changed, 89 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 60476398e5ca7..74bb529b2526e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3388,6 +3388,10 @@ class BoUpSLP { TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->State = EntryState; + // FIXME: Remove once support for ReuseShuffleIndices has been implemented + // for non-power-of-two vectors. + assert((has_single_bit(VL.size()) || ReuseShuffleIndices.empty()) && + "Reshuffling scalars not yet supported for nodes with padding"); Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); if (ReorderIndices.empty()) { @@ -3452,11 +3456,8 @@ class BoUpSLP { MustGather.insert(VL.begin(), VL.end()); } - if (UserTreeIdx.UserTE) { + if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) && - "Reordering isn't implemented for non-power-of-2 nodes yet"); - } return Last; } @@ -4731,12 +4732,6 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( auto *VecTy = getWidenedType(ScalarTy, Sz); // Check the order of pointer operands or that all pointers are the same. bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order); - // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. - if (!Order.empty() && !has_single_bit(VL.size())) { - assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only " - "supported with VectorizeNonPowerOf2"); - return LoadsState::Gather; - } Align CommonAlignment = computeCommonAlignment(VL); if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) && @@ -4824,6 +4819,12 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( // representation is better than just gather. auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment, bool ProfitableGatherPointers) { + // FIXME: The following code has not been updated for non-power-of-2 + // vectors. The splitting logic here does not cover the original + // vector if the vector factor is not a power of two. FIXME + if (!has_single_bit(VL.size())) + return false; + // Compare masked gather cost and loads + insert subvector costs. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto [ScalarGEPCost, VectorGEPCost] = @@ -5195,13 +5196,13 @@ static bool areTwoInsertFromSameBuildVector( std::optional BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { - // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. - if (TE.isNonPowOf2Vec()) - return std::nullopt; - // No need to reorder if need to shuffle reuses, still need to shuffle the // node. if (!TE.ReuseShuffleIndices.empty()) { + // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors. 
+ assert(!TE.isNonPowOf2Vec() && + "Reshuffling scalars not yet supported for nodes with padding"); + if (isSplat(TE.Scalars)) return std::nullopt; // Check if reuse shuffle indices can be improved by reordering. @@ -5424,11 +5425,15 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { } if (isSplat(TE.Scalars)) return std::nullopt; - if (TE.Scalars.size() >= 4) + if (TE.Scalars.size() >= 3) if (std::optional Order = findPartiallyOrderedLoads(TE)) return Order; - if (std::optional CurrentOrder = findReusedOrderedScalars(TE)) - return CurrentOrder; + + // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars + // has been auditted for correctness with non-power-of-two vectors. + if (!TE.isNonPowOf2Vec()) + if (std::optional CurrentOrder = findReusedOrderedScalars(TE)) + return CurrentOrder; } return std::nullopt; } @@ -5580,7 +5585,7 @@ void BoUpSLP::reorderTopToBottom() { // Reorder the graph nodes according to their vectorization factor. for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1; - VF /= 2) { + VF = bit_ceil(VF) / 2) { auto It = VFToOrderedEntries.find(VF); if (It == VFToOrderedEntries.end()) continue; @@ -5752,10 +5757,6 @@ bool BoUpSLP::canReorderOperands( TreeEntry *UserTE, SmallVectorImpl> &Edges, ArrayRef ReorderableGathers, SmallVectorImpl &GatherOps) { - // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. - if (UserTE->isNonPowOf2Vec()) - return false; - for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { if (any_of(Edges, [I](const std::pair &OpData) { return OpData.first == I && @@ -5927,9 +5928,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); const auto AllowsReordering = [&](const TreeEntry *TE) { - // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. 
- if (TE->isNonPowOf2Vec()) - return false; if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || (IgnoreReorder && TE->Idx == 0)) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index 9bbd314a27cb9..c9b2e0ffc15f3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -191,12 +191,12 @@ define i32 @reorder_indices_1(float %0) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] +; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP1]] ; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 ; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer ; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]] -; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]]) +; NON-POW2-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP10]]) ; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer ; NON-POW2-NEXT: store <3 x float> [[TMP9]], ptr [[NOR1]], align 4 @@ -263,7 +263,8 @@ define void @reorder_indices_2(ptr %spoint) { ; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 ; NON-POW2-NEXT: [[TMP0:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> zeroinitializer, <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[TMP1:%.*]] = fmul <3 x float> [[TMP0]], zeroinitializer -; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DSCO]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[DSCO]], align 4 ; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: define void @reorder_indices_2( @@ -566,11 +567,11 @@ define void @can_reorder_vec3_op_with_padding(ptr %A, <3 x float> %in) { ; NON-POW2-LABEL: define void @can_reorder_vec3_op_with_padding( ; NON-POW2-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) { ; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP1:%.*]] = fsub <3 x float> [[TMP0]], [[TMP0]] +; NON-POW2-NEXT: [[TMP1:%.*]] = fsub <3 x float> [[IN]], [[IN]] ; NON-POW2-NEXT: [[TMP2:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> , <3 x float> ) ; NON-POW2-NEXT: [[TMP3:%.*]] = fmul <3 x float> [[TMP2]], -; NON-POW2-NEXT: store <3 x float> [[TMP3]], ptr [[A]], align 4 +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: store <3 x float> [[TMP4]], ptr [[A]], align 4 ; NON-POW2-NEXT: ret void ; ; 
POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding( diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index d9e5655f4b4e0..4e8e019e155db 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -557,25 +557,34 @@ define i32 @dot_product_i32(ptr %a, ptr %b) { ; Same as above, except the reduction order has been perturbed. This ; is checking for our ability to reorder. define i32 @dot_product_i32_reorder(ptr %a, ptr %b) { -; CHECK-LABEL: @dot_product_i32_reorder( -; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 -; CHECK-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4 -; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1 -; CHECK-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4 -; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2 -; CHECK-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4 -; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0 -; CHECK-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4 -; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1 -; CHECK-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2 -; CHECK-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]] -; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]] -; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]] -; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]] -; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]] -; CHECK-NEXT: ret i32 [[ADD_1]] +; NON-POW2-LABEL: @dot_product_i32_reorder( +; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 +; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4 +; NON-POW2-NEXT: [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]] +; NON-POW2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]]) +; NON-POW2-NEXT: ret i32 [[TMP4]] +; +; POW2-ONLY-LABEL: @dot_product_i32_reorder( +; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 +; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4 +; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1 +; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4 +; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2 +; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0 +; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1 +; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2 +; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4 +; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]] +; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]] +; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 
[[L_A_2]], [[L_B_2]] +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]] +; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]] +; POW2-ONLY-NEXT: ret i32 [[ADD_1]] ; %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0 %l.a.0 = load i32, ptr %gep.a.0, align 4 @@ -653,22 +662,31 @@ define float @dot_product_fp32(ptr %a, ptr %b) { ; Same as above, except the reduction order has been perturbed. This ; is checking for our ability to reorder. define float @dot_product_fp32_reorder(ptr %a, ptr %b) { -; CHECK-LABEL: @dot_product_fp32_reorder( -; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; CHECK-NEXT: ret float [[ADD_1]] +; NON-POW2-LABEL: @dot_product_fp32_reorder( +; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4 +; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]] +; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) +; NON-POW2-NEXT: ret float [[TMP4]] +; +; POW2-ONLY-LABEL: @dot_product_fp32_reorder( +; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] +; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]] +; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; POW2-ONLY-NEXT: ret float [[ADD_1]] ; %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 %l.a.0 = load float, ptr %gep.a.0, align 4 diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll index 1399b4c35c781..22a59d3da52a6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll @@ -190,12 +190,12 @@ define i32 @reorder_indices_1(float %0) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] +; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP1]] ; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 ; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer ; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]] -; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]]) +; NON-POW2-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP10]]) ; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer ; NON-POW2-NEXT: store <3 x float> [[TMP9]], ptr [[NOR1]], align 4 @@ -262,7 +262,8 @@ define void @reorder_indices_2(ptr %spoint) { ; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 ; NON-POW2-NEXT: [[TMP0:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> zeroinitializer, <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[TMP1:%.*]] = fmul <3 x float> [[TMP0]], zeroinitializer -; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DSCO]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[DSCO]], align 4 ; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: define void @reorder_indices_2( From 3b19e480c056a35a60e3c65de476b6097329ceac Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 5 Sep 2024 15:59:36 +0100 Subject: [PATCH 236/425] [flang] Warn when F128 is unsupported (#102147) (#106957) This generates `warning: REAL(KIND=16) is not an enabled type for this target` if that type is used in a build not correctly configured to support this type. Uses of `selected_real_kind(30)` return -1. Relanding #102147 because the test errors turned out to be specific to a downstream configuration. 
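The support check itself is small; the sketch below condenses the TargetSetup.h hunk in this patch into one place (the helper name is purely illustrative and does not exist in the patch):

    // Illustrative-only restatement of the check added to
    // flang/include/flang/Tools/TargetSetup.h; the function name is made up.
    #include <cfloat>

    constexpr bool targetSupportsReal16() {
    #ifdef FLANG_RUNTIME_F128_MATH_LIB
      return true;   // libquadmath wrappers provide the F128 math entries
    #elif LDBL_MANT_DIG == 113
      return true;   // long double is IEEE binary128, so libm wrappers suffice
    #else
      return false;  // REAL(KIND=16) is disabled for this target
    #endif
    }
    // When this yields false, selected_real_kind(30) returns -1 and uses of
    // REAL(KIND=16) emit the warning quoted above.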
--- flang/include/flang/Tools/TargetSetup.h | 21 +++++++- flang/module/ieee_arithmetic.f90 | 49 +++++++++++++++++++ flang/test/CMakeLists.txt | 10 ++++ flang/test/Evaluate/fold-out_of_range.f90 | 1 + flang/test/Evaluate/folding07.f90 | 1 + .../Lower/Intrinsics/ieee_class_queries.f90 | 1 + .../test/Lower/Intrinsics/ieee_unordered.f90 | 1 + flang/test/Lower/common-block.f90 | 2 +- flang/test/Semantics/kinds03.f90 | 1 + flang/test/Semantics/modfile26.f90 | 1 + flang/test/Semantics/realkinds-aarch64-01.f90 | 1 + flang/test/lit.cfg.py | 3 +- flang/test/lit.site.cfg.py.in | 1 + flang/tools/f18/CMakeLists.txt | 25 +++++++++- 14 files changed, 114 insertions(+), 4 deletions(-) diff --git a/flang/include/flang/Tools/TargetSetup.h b/flang/include/flang/Tools/TargetSetup.h index 37c1e1d2ff63f..ee89249441c17 100644 --- a/flang/include/flang/Tools/TargetSetup.h +++ b/flang/include/flang/Tools/TargetSetup.h @@ -12,6 +12,7 @@ #include "flang/Evaluate/target.h" #include "flang/Frontend/TargetOptions.h" #include "llvm/Target/TargetMachine.h" +#include namespace Fortran::tools { @@ -23,9 +24,27 @@ namespace Fortran::tools { const llvm::Triple &targetTriple{targetMachine.getTargetTriple()}; // FIXME: Handle real(3) ? - if (targetTriple.getArch() != llvm::Triple::ArchType::x86_64) + if (targetTriple.getArch() != llvm::Triple::ArchType::x86_64) { targetCharacteristics.DisableType( Fortran::common::TypeCategory::Real, /*kind=*/10); + } + + // Figure out if we can support F128: see + // flang/runtime/Float128Math/math-entries.h + // TODO: this should be taken from TargetInfo::getLongDoubleFormat to support + // cross-compilation +#ifdef FLANG_RUNTIME_F128_MATH_LIB + // we can use libquadmath wrappers + constexpr bool f128Support = true; +#elif LDBL_MANT_DIG == 113 + // we can use libm wrappers + constexpr bool f128Support = true; +#else + constexpr bool f128Support = false; +#endif + + if constexpr (!f128Support) + targetCharacteristics.DisableType(Fortran::common::TypeCategory::Real, 16); for (auto realKind : targetOptions.disabledRealKinds) targetCharacteristics.DisableType(common::TypeCategory::Real, realKind); diff --git a/flang/module/ieee_arithmetic.f90 b/flang/module/ieee_arithmetic.f90 index 7c7721d78c1ed..32e640b9e2457 100644 --- a/flang/module/ieee_arithmetic.f90 +++ b/flang/module/ieee_arithmetic.f90 @@ -161,6 +161,8 @@ end function ieee_round_ne G(1) G(2) G(4) G(8) G(16) #define SPECIFICS_L(G) \ G(1) G(2) G(4) G(8) + +#if FLANG_SUPPORT_R16 #if __x86_64__ #define SPECIFICS_R(G) \ G(2) G(3) G(4) G(8) G(10) G(16) @@ -168,12 +170,24 @@ end function ieee_round_ne #define SPECIFICS_R(G) \ G(2) G(3) G(4) G(8) G(16) #endif +#else +#if __x86_64__ +#define SPECIFICS_R(G) \ + G(2) G(3) G(4) G(8) G(10) +#else +#define SPECIFICS_R(G) \ + G(2) G(3) G(4) G(8) +#endif +#endif + #define SPECIFICS_II(G) \ G(1,1) G(1,2) G(1,4) G(1,8) G(1,16) \ G(2,1) G(2,2) G(2,4) G(2,8) G(2,16) \ G(4,1) G(4,2) G(4,4) G(4,8) G(4,16) \ G(8,1) G(8,2) G(8,4) G(8,8) G(8,16) \ G(16,1) G(16,2) G(16,4) G(16,8) G(16,16) + +#if FLANG_SUPPORT_R16 #if __x86_64__ #define SPECIFICS_RI(G) \ G(2,1) G(2,2) G(2,4) G(2,8) G(2,16) \ @@ -190,7 +204,24 @@ end function ieee_round_ne G(8,1) G(8,2) G(8,4) G(8,8) G(8,16) \ G(16,1) G(16,2) G(16,4) G(16,8) G(16,16) #endif +#else +#if __x86_64__ +#define SPECIFICS_RI(G) \ + G(2,1) G(2,2) G(2,4) G(2,8) \ + G(3,1) G(3,2) G(3,4) G(3,8) \ + G(4,1) G(4,2) G(4,4) G(4,8) \ + G(8,1) G(8,2) G(8,4) G(8,8) \ + G(10,1) G(10,2) G(10,4) G(10,8) +#else +#define SPECIFICS_RI(G) \ + G(2,1) G(2,2) G(2,4) G(2,8) \ + G(3,1) G(3,2) 
G(3,4) G(3,8) \ + G(4,1) G(4,2) G(4,4) G(4,8) \ + G(8,1) G(8,2) G(8,4) G(8,8) +#endif +#endif +#if FLANG_SUPPORT_R16 #if __x86_64__ #define SPECIFICS_RR(G) \ G(2,2) G(2,3) G(2,4) G(2,8) G(2,10) G(2,16) \ @@ -207,6 +238,22 @@ end function ieee_round_ne G(8,2) G(8,3) G(8,4) G(8,8) G(8,16) \ G(16,2) G(16,3) G(16,4) G(16,8) G(16,16) #endif +#else +#if __x86_64__ +#define SPECIFICS_RR(G) \ + G(2,2) G(2,3) G(2,4) G(2,8) G(2,10) \ + G(3,2) G(3,3) G(3,4) G(3,8) G(3,10) \ + G(4,2) G(4,3) G(4,4) G(4,8) G(4,10) \ + G(8,2) G(8,3) G(8,4) G(8,8) G(8,10) \ + G(10,2) G(10,3) G(10,4) G(10,8) G(10,10) +#else +#define SPECIFICS_RR(G) \ + G(2,2) G(2,3) G(2,4) G(2,8) \ + G(3,2) G(3,3) G(3,4) G(3,8) \ + G(4,2) G(4,3) G(4,4) G(4,8) \ + G(8,2) G(8,3) G(8,4) G(8,8) +#endif +#endif #define IEEE_CLASS_R(XKIND) \ elemental type(ieee_class_type) function ieee_class_a##XKIND(x); \ @@ -462,8 +509,10 @@ end function ieee_real_a##AKIND##_i##KKIND; interface ieee_real SPECIFICS_I(IEEE_REAL_I) SPECIFICS_R(IEEE_REAL_R) +#if FLANG_SUPPORT_R16 SPECIFICS_II(IEEE_REAL_II) SPECIFICS_RI(IEEE_REAL_RI) +#endif end interface ieee_real public :: ieee_real #undef IEEE_REAL_I diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt index 43ad1e3312b64..a18a5c6519eda 100644 --- a/flang/test/CMakeLists.txt +++ b/flang/test/CMakeLists.txt @@ -11,6 +11,16 @@ llvm_canonicalize_cmake_booleans( set(FLANG_TOOLS_DIR ${FLANG_BINARY_DIR}/bin) +# Check if 128-bit float computations can be done via long double +check_cxx_source_compiles( + "#include + #if LDBL_MANT_DIG != 113 + #error LDBL_MANT_DIG != 113 + #endif + int main() { return 0; } + " + HAVE_LDBL_MANT_DIG_113) + # FIXME In out-of-tree builds, "SHLIBDIR" is undefined and passing it to # `configure_lit_site_cfg` leads to a configuration error. This is currently # only required by plugins/examples, which are not supported in out-of-tree diff --git a/flang/test/Evaluate/fold-out_of_range.f90 b/flang/test/Evaluate/fold-out_of_range.f90 index 5a9f900beb2d5..6360eee5322bb 100644 --- a/flang/test/Evaluate/fold-out_of_range.f90 +++ b/flang/test/Evaluate/fold-out_of_range.f90 @@ -1,6 +1,7 @@ ! RUN: %python %S/test_folding.py %s %flang_fc1 -pedantic -triple x86_64-unknown-linux-gnu ! UNSUPPORTED: system-windows ! REQUIRES: target=x86_64{{.*}} +! REQUIRES: flang-supports-f128-math ! Tests folding of OUT_OF_RANGE(). module m integer(1), parameter :: i1v(*) = [ -huge(1_1) - 1_1, huge(1_1) ] diff --git a/flang/test/Evaluate/folding07.f90 b/flang/test/Evaluate/folding07.f90 index 3b6a99df38826..d51df7acf7b8a 100644 --- a/flang/test/Evaluate/folding07.f90 +++ b/flang/test/Evaluate/folding07.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: %python %S/test_folding.py %s %flang_fc1 ! Test numeric model inquiry intrinsics diff --git a/flang/test/Lower/Intrinsics/ieee_class_queries.f90 b/flang/test/Lower/Intrinsics/ieee_class_queries.f90 index bb7787ea903e2..b2f9df83a902a 100644 --- a/flang/test/Lower/Intrinsics/ieee_class_queries.f90 +++ b/flang/test/Lower/Intrinsics/ieee_class_queries.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: bbc -emit-fir -o - %s | FileCheck %s ! CHECK-LABEL: func @_QQmain diff --git a/flang/test/Lower/Intrinsics/ieee_unordered.f90 b/flang/test/Lower/Intrinsics/ieee_unordered.f90 index a6146eff7f06e..b7e81d53a2d75 100644 --- a/flang/test/Lower/Intrinsics/ieee_unordered.f90 +++ b/flang/test/Lower/Intrinsics/ieee_unordered.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s ! 
CHECK-LABEL: func @_QQmain diff --git a/flang/test/Lower/common-block.f90 b/flang/test/Lower/common-block.f90 index 94e61b8bcc33d..b5c1389df45d3 100644 --- a/flang/test/Lower/common-block.f90 +++ b/flang/test/Lower/common-block.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: bbc %s -o - | tco | FileCheck %s ! RUN: %flang -emit-llvm -S -mmlir -disable-external-name-interop %s -o - | FileCheck %s @@ -78,4 +79,3 @@ subroutine s7() real(16) r16 common /co1/ r16 end subroutine - diff --git a/flang/test/Semantics/kinds03.f90 b/flang/test/Semantics/kinds03.f90 index 751d4a9ffa3cb..a15a4a9baa731 100644 --- a/flang/test/Semantics/kinds03.f90 +++ b/flang/test/Semantics/kinds03.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: %python %S/test_symbols.py %s %flang_fc1 !DEF: /MainProgram1/ipdt DerivedType !DEF: /MainProgram1/ipdt/k TypeParam INTEGER(4) diff --git a/flang/test/Semantics/modfile26.f90 b/flang/test/Semantics/modfile26.f90 index 09dc8b0954d63..9c5d0015d9163 100644 --- a/flang/test/Semantics/modfile26.f90 +++ b/flang/test/Semantics/modfile26.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: %python %S/test_modfile.py %s %flang_fc1 ! Intrinsics SELECTED_INT_KIND, SELECTED_REAL_KIND, PRECISION, RANGE, ! RADIX, DIGITS diff --git a/flang/test/Semantics/realkinds-aarch64-01.f90 b/flang/test/Semantics/realkinds-aarch64-01.f90 index e22920ff991e9..2520c0b84c0e6 100644 --- a/flang/test/Semantics/realkinds-aarch64-01.f90 +++ b/flang/test/Semantics/realkinds-aarch64-01.f90 @@ -1,4 +1,5 @@ ! REQUIRES: aarch64-registered-target +! REQUIRES: flang-supports-f128-math ! RUN: %python %S/test_modfile.py %s %flang_fc1 -triple aarch64-unknown-linux-gnu module m1 diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py index 37869e7e2ecd7..4acbc0606d197 100644 --- a/flang/test/lit.cfg.py +++ b/flang/test/lit.cfg.py @@ -216,8 +216,9 @@ # Add features and substitutions to test F128 math support. # %f128-lib substitution may be used to generate check prefixes # for LIT tests checking for F128 library support. -if config.flang_runtime_f128_math_lib: +if config.flang_runtime_f128_math_lib or config.have_ldbl_mant_dig_113: config.available_features.add("flang-supports-f128-math") +if config.flang_runtime_f128_math_lib: config.available_features.add( "flang-f128-math-lib-" + config.flang_runtime_f128_math_lib ) diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in index fe6186d714071..d1a0ac763cf8a 100644 --- a/flang/test/lit.site.cfg.py.in +++ b/flang/test/lit.site.cfg.py.in @@ -31,6 +31,7 @@ if "openmp" in "@LLVM_ENABLE_RUNTIMES@".lower().split(";"): else: config.openmp_module_dir = None config.flang_runtime_f128_math_lib = "@FLANG_RUNTIME_F128_MATH_LIB@" +config.have_ldbl_mant_dig_113 = "@HAVE_LDBL_MANT_DIG_113@" import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index cec4e2d810720..344a781c41e95 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -31,6 +31,25 @@ set(MODULES_WITHOUT_IMPLEMENTATION set(MODULES ${MODULES_WITH_IMPLEMENTATION} ${MODULES_WITHOUT_IMPLEMENTATION}) +# Check if 128-bit float computations can be done via long double. 
+check_cxx_source_compiles( + "#include + #if LDBL_MANT_DIG != 113 + #error LDBL_MANT_DIG != 113 + #endif + int main() { return 0; } + " + HAVE_LDBL_MANT_DIG_113) + +# Figure out whether we can support REAL(KIND=16) +if (FLANG_RUNTIME_F128_MATH_LIB) + set(FLANG_SUPPORT_R16 "1") +elseif (HAVE_LDBL_MANT_DIG_113) + set(FLANG_SUPPORT_R16 "1") +else() + set(FLANG_SUPPORT_R16 "0") +endif() + # Init variable to hold extra object files coming from the Fortran modules; # these module files will be contributed from the CMakeLists in flang/tools/f18. set(module_objects "") @@ -76,6 +95,10 @@ if (NOT CMAKE_CROSSCOMPILING) endif() endif() + set(decls "") + if (FLANG_SUPPORT_R16) + set(decls "-DFLANG_SUPPORT_R16") + endif() # Some modules have an implementation part that needs to be added to the # FortranRuntime library. @@ -92,7 +115,7 @@ if (NOT CMAKE_CROSSCOMPILING) # TODO: We may need to flag this with conditional, in case Flang is built w/o OpenMP support add_custom_command(OUTPUT ${base}.mod ${object_output} COMMAND ${CMAKE_COMMAND} -E make_directory ${FLANG_INTRINSIC_MODULES_DIR} - COMMAND flang-new ${opts} -cpp ${compile_with} -module-dir ${FLANG_INTRINSIC_MODULES_DIR} + COMMAND flang-new ${opts} ${decls} -cpp ${compile_with} -module-dir ${FLANG_INTRINSIC_MODULES_DIR} ${FLANG_SOURCE_DIR}/module/${filename}.f90 DEPENDS flang-new ${FLANG_SOURCE_DIR}/module/${filename}.f90 ${FLANG_SOURCE_DIR}/module/__fortran_builtins.f90 ${depends} ) From e80f48986c7ba6cc41378b8d8e12d804cf26895d Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Thu, 5 Sep 2024 17:01:56 +0200 Subject: [PATCH 237/425] [SCEV] BECount to zero if `((-C + (C smax %x)) /u %x), C > 0` holds The SCEV expression `((-C + (C smax %x)) /u %x)` can be folded to zero for any positive constant C. Proof: https://alive2.llvm.org/ce/z/_dLm8C. --- llvm/lib/Analysis/ScalarEvolution.cpp | 16 ++++ .../udiv-of-x-xsmaxone-fold.ll | 96 +++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 llvm/test/Analysis/ScalarEvolution/udiv-of-x-xsmaxone-fold.ll diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 6b4a81c217b3c..57e03f667ba6f 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -3547,6 +3547,22 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, } } + // ((-C + (C smax %x)) /u %x) evaluates to zero, for any positive constant C. + if (const auto *AE = dyn_cast(LHS); + AE && AE->getNumOperands() == 2) { + if (const auto *VC = dyn_cast(AE->getOperand(0))) { + const APInt &NegC = VC->getAPInt(); + if (NegC.isNegative() && !NegC.isMinSignedValue()) { + const auto *MME = dyn_cast(AE->getOperand(1)); + if (MME && MME->getNumOperands() == 2 && + isa(MME->getOperand(0)) && + cast(MME->getOperand(0))->getAPInt() == -NegC && + MME->getOperand(1) == RHS) + return getZero(LHS->getType()); + } + } + } + // The Insertion Point (IP) might be invalid by now (due to UniqueSCEVs // changes). Make sure we get a new one. 
   IP = nullptr;
diff --git a/llvm/test/Analysis/ScalarEvolution/udiv-of-x-xsmaxone-fold.ll b/llvm/test/Analysis/ScalarEvolution/udiv-of-x-xsmaxone-fold.ll
new file mode 100644
index 0000000000000..9405c0f726ac7
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/udiv-of-x-xsmaxone-fold.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -disable-output -passes="print<scalar-evolution>" < %s 2>&1 | FileCheck %s
+
+define i32 @test_expr_with_constant_1(i32 %x) {
+; CHECK-LABEL: 'test_expr_with_constant_1'
+; CHECK-NEXT: Classifying expressions for: @test_expr_with_constant_1
+; CHECK-NEXT: %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 1)
+; CHECK-NEXT: --> (1 smax %x) U: [1,-2147483648) S: [1,-2147483648)
+; CHECK-NEXT: %add = add nsw i32 %smax, -1
+; CHECK-NEXT: --> (-1 + (1 smax %x)) U: [0,2147483647) S: [0,2147483647)
+; CHECK-NEXT: %udiv = udiv i32 %add, %x
+; CHECK-NEXT: --> 0 U: [0,1) S: [0,1)
+; CHECK-NEXT: Determining loop execution counts for: @test_expr_with_constant_1
+;
+entry:
+  %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 1)
+  %add = add nsw i32 %smax, -1
+  %udiv = udiv i32 %add, %x
+  ret i32 %udiv
+}
+
+; Non-1 constant: (-2 + (2 smax %x)) /u %x
+define i32 @test_expr_with_constant_2(i32 %x) {
+; CHECK-LABEL: 'test_expr_with_constant_2'
+; CHECK-NEXT: Classifying expressions for: @test_expr_with_constant_2
+; CHECK-NEXT: %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 2)
+; CHECK-NEXT: --> (2 smax %x) U: [2,-2147483648) S: [2,-2147483648)
+; CHECK-NEXT: %add = add nsw i32 %smax, -2
+; CHECK-NEXT: --> (-2 + (2 smax %x)) U: [0,2147483646) S: [0,2147483646)
+; CHECK-NEXT: %udiv = udiv i32 %add, %x
+; CHECK-NEXT: --> 0 U: [0,1) S: [0,1)
+; CHECK-NEXT: Determining loop execution counts for: @test_expr_with_constant_2
+;
+entry:
+  %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 2)
+  %add = add nsw i32 %smax, -2
+  %udiv = udiv i32 %add, %x
+  ret i32 %udiv
+}
+
+; Negative test, constants mismatch: (-3 + (2 smax %x)) /u %x
+define i32 @test_expr_mismatch_constants(i32 %x) {
+; CHECK-LABEL: 'test_expr_mismatch_constants'
+; CHECK-NEXT: Classifying expressions for: @test_expr_mismatch_constants
+; CHECK-NEXT: %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 2)
+; CHECK-NEXT: --> (2 smax %x) U: [2,-2147483648) S: [2,-2147483648)
+; CHECK-NEXT: %add = add nsw i32 %smax, -3
+; CHECK-NEXT: --> (-3 + (2 smax %x)) U: [-1,2147483645) S: [-1,2147483645)
+; CHECK-NEXT: %udiv = udiv i32 %add, %x
+; CHECK-NEXT: --> ((-3 + (2 smax %x)) /u %x) U: full-set S: full-set
+; CHECK-NEXT: Determining loop execution counts for: @test_expr_mismatch_constants
+;
+entry:
+  %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 2)
+  %add = add nsw i32 %smax, -3
+  %udiv = udiv i32 %add, %x
+  ret i32 %udiv
+}
+
+; Negative constant: (3 + (-3 smax %x)) /u %x
+define i32 @test_expr_negative_constant(i32 %x) {
+; CHECK-LABEL: 'test_expr_negative_constant'
+; CHECK-NEXT: Classifying expressions for: @test_expr_negative_constant
+; CHECK-NEXT: %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 -3)
+; CHECK-NEXT: --> (-3 smax %x) U: [-3,-2147483648) S: [-3,-2147483648)
+; CHECK-NEXT: %add = add nsw i32 %smax, 3
+; CHECK-NEXT: --> (3 + (-3 smax %x)) U: [0,-2147483645) S: [0,-2147483645)
+; CHECK-NEXT: %udiv = udiv i32 %add, %x
+; CHECK-NEXT: --> ((3 + (-3 smax %x)) /u %x) U: [0,-2147483645) S: [0,-2147483645)
+; CHECK-NEXT: Determining loop execution counts for: @test_expr_negative_constant
+;
+entry:
+  %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 -3)
+  %add = add nsw 
i32 %smax, 3 + %udiv = udiv i32 %add, %x + ret i32 %udiv +} + +; Negative signed minimum value. +define i8 @text_expr_with_constant_signed_min(i8 %x) { +; CHECK-LABEL: 'text_expr_with_constant_signed_min' +; CHECK-NEXT: Classifying expressions for: @text_expr_with_constant_signed_min +; CHECK-NEXT: %smax = tail call i8 @llvm.smax.i8(i8 %x, i8 -128) +; CHECK-NEXT: --> %x U: full-set S: full-set +; CHECK-NEXT: %add = add nsw i8 %smax, -128 +; CHECK-NEXT: --> (-128 + %x) U: full-set S: full-set +; CHECK-NEXT: %udiv = udiv i8 %add, %x +; CHECK-NEXT: --> ((-128 + %x) /u %x) U: full-set S: full-set +; CHECK-NEXT: Determining loop execution counts for: @text_expr_with_constant_signed_min +; +entry: + %smax = tail call i8 @llvm.smax.i8(i8 %x, i8 128) + %add = add nsw i8 %smax, -128 + %udiv = udiv i8 %add, %x + ret i8 %udiv +} From 7eca38ce76d5d1915f4ab7e665964062c0b37697 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Thu, 5 Sep 2024 16:13:11 +0100 Subject: [PATCH 238/425] Reland "[clang] Add nuw attribute to GEPs (#105496)" (#107257) Add nuw attribute to inbounds GEPs where the expression used to form the GEP is an addition of unsigned indices. Relands #105496, which was reverted because it exposed a miscompilation arising from #98608. This is now fixed by #106512. --- clang/lib/CodeGen/CGBuilder.h | 6 +- clang/lib/CodeGen/CGExprScalar.cpp | 17 +- clang/test/CodeGen/2005-01-02-ConstantInits.c | 4 +- clang/test/CodeGen/PowerPC/ppc-emmintrin.c | 12 +- clang/test/CodeGen/PowerPC/ppc-xmmintrin.c | 16 +- clang/test/CodeGen/attr-counted-by.c | 16 +- ...d-nonzero-offset-when-nullptr-is-defined.c | 2 +- .../catch-nullptr-and-nonzero-offset.c | 64 ++--- .../CodeGen/catch-pointer-overflow-volatile.c | 2 +- clang/test/CodeGen/catch-pointer-overflow.c | 6 +- clang/test/CodeGen/ext-int.c | 2 +- .../test/CodeGen/hexagon-brev-ld-ptr-incdec.c | 6 +- clang/test/CodeGen/integer-overflow.c | 6 +- clang/test/CodeGen/ms-intrinsics.c | 12 +- clang/test/CodeGen/ubsan-pointer-overflow.m | 2 +- clang/test/CodeGen/vla.c | 2 +- .../attr-likelihood-iteration-stmt.cpp | 36 +-- clang/test/CodeGenCXX/for-range.cpp | 12 +- .../CodeGenCXX/pr45964-decomp-transform.cpp | 2 +- clang/test/CodeGenCXX/vla.cpp | 4 +- .../CodeGenHLSL/buffer-array-operator.hlsl | 4 +- .../CodeGenSYCL/address-space-deduction.cpp | 48 ++-- clang/test/Headers/__clang_hip_math.hip | 70 ++--- clang/test/OpenMP/bug60602.cpp | 8 +- clang/test/OpenMP/declare_mapper_codegen.cpp | 4 +- clang/test/OpenMP/distribute_codegen.cpp | 32 +-- ...te_parallel_for_reduction_task_codegen.cpp | 24 +- clang/test/OpenMP/distribute_simd_codegen.cpp | 96 +++---- clang/test/OpenMP/for_linear_codegen.cpp | 2 +- clang/test/OpenMP/for_reduction_codegen.cpp | 64 ++--- .../test/OpenMP/for_reduction_codegen_UDR.cpp | 32 +-- .../OpenMP/for_reduction_task_codegen.cpp | 24 +- clang/test/OpenMP/for_scan_codegen.cpp | 40 +-- clang/test/OpenMP/for_simd_scan_codegen.cpp | 40 +-- clang/test/OpenMP/irbuilder_for_iterator.cpp | 6 +- clang/test/OpenMP/irbuilder_for_rangefor.cpp | 6 +- clang/test/OpenMP/irbuilder_for_unsigned.c | 8 +- .../test/OpenMP/irbuilder_for_unsigned_auto.c | 8 +- .../test/OpenMP/irbuilder_for_unsigned_down.c | 2 +- .../OpenMP/irbuilder_for_unsigned_dynamic.c | 8 +- .../irbuilder_for_unsigned_dynamic_chunked.c | 8 +- .../OpenMP/irbuilder_for_unsigned_runtime.c | 8 +- .../irbuilder_for_unsigned_static_chunked.c | 8 +- clang/test/OpenMP/map_struct_ordering.cpp | 2 +- .../master_taskloop_in_reduction_codegen.cpp | 10 +- .../master_taskloop_reduction_codegen.cpp | 12 +- 
...ter_taskloop_simd_in_reduction_codegen.cpp | 10 +- ...master_taskloop_simd_reduction_codegen.cpp | 12 +- clang/test/OpenMP/ordered_codegen.cpp | 80 +++--- clang/test/OpenMP/parallel_for_codegen.cpp | 144 +++++----- .../OpenMP/parallel_for_linear_codegen.cpp | 2 +- .../parallel_for_reduction_task_codegen.cpp | 24 +- .../test/OpenMP/parallel_for_scan_codegen.cpp | 44 ++-- .../OpenMP/parallel_for_simd_scan_codegen.cpp | 40 +-- ...parallel_master_reduction_task_codegen.cpp | 24 +- ...llel_master_taskloop_reduction_codegen.cpp | 12 +- ...master_taskloop_simd_reduction_codegen.cpp | 12 +- .../OpenMP/parallel_reduction_codegen.cpp | 12 +- .../parallel_reduction_task_codegen.cpp | 24 +- ...rallel_sections_reduction_task_codegen.cpp | 24 +- clang/test/OpenMP/reduction_implicit_map.cpp | 50 ++-- .../sections_reduction_task_codegen.cpp | 24 +- .../target_data_use_device_addr_codegen.cpp | 10 +- .../target_data_use_device_ptr_codegen.cpp | 82 +++--- .../OpenMP/target_has_device_addr_codegen.cpp | 24 +- .../OpenMP/target_in_reduction_codegen.cpp | 10 +- .../OpenMP/target_is_device_ptr_codegen.cpp | 208 +++++++-------- ...arget_map_both_pointer_pointee_codegen.cpp | 2 +- clang/test/OpenMP/target_map_codegen_01.cpp | 2 +- clang/test/OpenMP/target_map_codegen_21.cpp | 6 +- clang/test/OpenMP/target_map_codegen_27.cpp | 2 +- clang/test/OpenMP/target_map_codegen_28.cpp | 6 +- clang/test/OpenMP/target_map_codegen_29.cpp | 4 +- .../OpenMP/target_map_deref_array_codegen.cpp | 2 +- ..._map_member_expr_array_section_codegen.cpp | 8 +- .../OpenMP/target_map_member_expr_codegen.cpp | 6 +- ...target_map_nest_defalut_mapper_codegen.cpp | 2 +- ...et_parallel_for_reduction_task_codegen.cpp | 24 +- ...target_parallel_reduction_task_codegen.cpp | 24 +- .../OpenMP/target_task_affinity_codegen.cpp | 24 +- ...te_parallel_for_reduction_task_codegen.cpp | 44 ++-- clang/test/OpenMP/target_update_codegen.cpp | 34 +-- clang/test/OpenMP/task_codegen.c | 2 +- clang/test/OpenMP/task_codegen.cpp | 248 +++++++++--------- .../test/OpenMP/task_in_reduction_codegen.cpp | 10 +- .../taskgroup_task_reduction_codegen.cpp | 10 +- .../OpenMP/taskloop_in_reduction_codegen.cpp | 10 +- .../OpenMP/taskloop_reduction_codegen.cpp | 12 +- .../taskloop_simd_in_reduction_codegen.cpp | 10 +- .../taskloop_simd_reduction_codegen.cpp | 12 +- ...te_parallel_for_reduction_task_codegen.cpp | 44 ++-- 91 files changed, 1118 insertions(+), 1105 deletions(-) diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h index 08730a6a6672a..b8036cf6e6a30 100644 --- a/clang/lib/CodeGen/CGBuilder.h +++ b/clang/lib/CodeGen/CGBuilder.h @@ -14,6 +14,7 @@ #include "CodeGenTypeCache.h" #include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/GEPNoWrapFlags.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Type.h" @@ -334,9 +335,10 @@ class CGBuilderTy : public CGBuilderBaseTy { Address CreateGEP(Address Addr, ArrayRef IdxList, llvm::Type *ElementType, CharUnits Align, - const Twine &Name = "") { + const Twine &Name = "", + llvm::GEPNoWrapFlags NW = llvm::GEPNoWrapFlags::none()) { llvm::Value *Ptr = emitRawPointerFromAddress(Addr); - return RawAddress(CreateGEP(Addr.getElementType(), Ptr, IdxList, Name), + return RawAddress(CreateGEP(Addr.getElementType(), Ptr, IdxList, Name, NW), ElementType, Align); } diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 7aa2d3d89c293..88fbbe6c4c965 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -36,6 
+36,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/FixedPointBuilder.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GEPNoWrapFlags.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Intrinsics.h" @@ -5759,7 +5760,12 @@ CodeGenFunction::EmitCheckedInBoundsGEP(llvm::Type *ElemTy, Value *Ptr, bool SignedIndices, bool IsSubtraction, SourceLocation Loc, const Twine &Name) { llvm::Type *PtrTy = Ptr->getType(); - Value *GEPVal = Builder.CreateInBoundsGEP(ElemTy, Ptr, IdxList, Name); + + llvm::GEPNoWrapFlags NWFlags = llvm::GEPNoWrapFlags::inBounds(); + if (!SignedIndices && !IsSubtraction) + NWFlags |= llvm::GEPNoWrapFlags::noUnsignedWrap(); + + Value *GEPVal = Builder.CreateGEP(ElemTy, Ptr, IdxList, Name, NWFlags); // If the pointer overflow sanitizer isn't enabled, do nothing. if (!SanOpts.has(SanitizerKind::PointerOverflow)) @@ -5874,8 +5880,13 @@ Address CodeGenFunction::EmitCheckedInBoundsGEP( Address Addr, ArrayRef IdxList, llvm::Type *elementType, bool SignedIndices, bool IsSubtraction, SourceLocation Loc, CharUnits Align, const Twine &Name) { - if (!SanOpts.has(SanitizerKind::PointerOverflow)) - return Builder.CreateInBoundsGEP(Addr, IdxList, elementType, Align, Name); + if (!SanOpts.has(SanitizerKind::PointerOverflow)) { + llvm::GEPNoWrapFlags NWFlags = llvm::GEPNoWrapFlags::inBounds(); + if (!SignedIndices && !IsSubtraction) + NWFlags |= llvm::GEPNoWrapFlags::noUnsignedWrap(); + + return Builder.CreateGEP(Addr, IdxList, elementType, Align, Name, NWFlags); + } return RawAddress( EmitCheckedInBoundsGEP(Addr.getElementType(), Addr.emitRawPointer(*this), diff --git a/clang/test/CodeGen/2005-01-02-ConstantInits.c b/clang/test/CodeGen/2005-01-02-ConstantInits.c index 7772a64331ffb..d90c2ea42da61 100644 --- a/clang/test/CodeGen/2005-01-02-ConstantInits.c +++ b/clang/test/CodeGen/2005-01-02-ConstantInits.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --global-value-regex "@.+" +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --global-value-regex "[A-Za-z].*" // RUN: %clang_cc1 -triple=x86_64-unknown-linux %s -emit-llvm -o - | FileCheck %s // This tests all kinds of hard cases with initializers and @@ -51,7 +51,7 @@ int foo(int i) { return bar(&Arr[49])+bar(&Arr[i]); } // CHECK-NEXT: store i32 [[I]], ptr [[I_ADDR]], align 4 // CHECK-NEXT: store ptr @Arr, ptr [[P]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[P]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I_ADDR]], align 4 // CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[TMP1]] to i64 diff --git a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c index a3650beec625f..4c4d0dfce05ea 100644 --- a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c @@ -1012,14 +1012,14 @@ test_shuffle() { // CHECK: %[[SHR:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR]], 3 // CHECK: sext i32 %[[AND4]] to i64 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr 
inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 0 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 1 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: %[[ADD:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD]], i32 2 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16]) @@ -1050,7 +1050,7 @@ test_shuffle() { // CHECK: sext i32 %[[AND4]] to i64 // CHECK-LE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 // CHECK-BE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 -// CHECK-COUNT-4: getelementptr inbounds [4 x i16], ptr @_mm_shufflehi_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} +// CHECK-COUNT-4: getelementptr inbounds nuw [4 x i16], ptr @_mm_shufflehi_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} // CHECK: call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16]) // CHECK-LABEL: define available_externally <2 x i64> @_mm_shufflelo_epi16 @@ -1067,7 +1067,7 @@ test_shuffle() { // CHECK: sext i32 %[[AND4]] to i64 // CHECK-LE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 // CHECK-BE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 -// CHECK-COUNT-4: getelementptr inbounds [4 x i16], ptr @_mm_shufflelo_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} +// CHECK-COUNT-4: getelementptr inbounds nuw [4 x i16], ptr @_mm_shufflelo_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} // CHECK: call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16]) void __attribute__((noinline)) diff --git a/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c index 95dfd1202f157..4a15fa9f76cee 100644 --- a/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c @@ -894,16 +894,16 @@ test_shuffle() { // CHECK: %[[SHR3:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR3]], 3 // CHECK: sext i32 %[[AND4]] to i64 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 0 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 3 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} 
+// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 1 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 2 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 2 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 1 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 3 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 0 // CHECK: call <2 x i64> @vec_splats(unsigned long long) @@ -923,14 +923,14 @@ test_shuffle() { // CHECK: %[[SHR3:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR3]], 3 // CHECK: sext i32 %[[AND4]] to i64 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 0 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 1 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: %[[ADD:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD]], i32 2 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: %[[ADD2:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD2]], i32 3 // CHECK: call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16]) diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index ab36b6e7720ba..a06e815737f4e 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -118,7 +118,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD]], 2 // 
SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP2]] @@ -134,7 +134,7 @@ void test1(struct annotated *p, int index, int val) { // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP0]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // @@ -142,7 +142,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -150,7 +150,7 @@ void test1(struct annotated *p, int index, int val) { // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -207,7 +207,7 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2 // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP3]], i64 4) @@ -231,7 +231,7 @@ size_t test2_bdos(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // @@ -239,7 +239,7 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -247,7 +247,7 @@ size_t test2_bdos(struct annotated *p) { // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // diff --git a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c index 39ede01d6e3b8..8a560a47ad1e1 100644 --- a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c +++ b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c @@ -33,7 +33,7 @@ char *add_unsigned(char *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize diff --git a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c index e93dbcb9f647b..d884993ffb2b3 100644 --- a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c +++ b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c @@ -50,7 +50,7 @@ char *var_var(char *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr 
%[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -83,7 +83,7 @@ char *var_zero(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 0 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 0 // CHECK-SANITIZE-C-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -111,7 +111,7 @@ char *var_one(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -140,7 +140,7 @@ char *var_allones(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 -1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 -1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], -1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -171,7 +171,7 @@ char *nullptr_var(unsigned long offset) { // CHECK-NEXT: %[[OFFSET_ADDR:.*]] = alloca i64, align 8 // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr null, i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr null, i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 
%[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -217,17 +217,17 @@ char *nullptr_zero(void) { char *nullptr_one_BAD(void) { // CHECK: define{{.*}} ptr @nullptr_one_BAD() // CHECK-NEXT: [[ENTRY:.*]]: - // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 1) to i64), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 1) to i64), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 1) to i64)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 1) to i64)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 1) to i64)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 1) to i64)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr null, i64 1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr null, i64 1) static char *const base = (char *)0; static const unsigned long offset = 1; #line 700 @@ -237,17 +237,17 @@ char *nullptr_one_BAD(void) { char *nullptr_allones_BAD(void) { // CHECK: define{{.*}} ptr @nullptr_allones_BAD() // CHECK-NEXT: [[ENTRY:.*]]: - // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 -1) to i64), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) to i64), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 -1) to i64)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 -1) to i64)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) to i64)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) to i64)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: 
unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr null, i64 -1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) static char *const base = (char *)0; static const unsigned long offset = -1; #line 800 @@ -262,7 +262,7 @@ char *one_var(unsigned long offset) { // CHECK-NEXT: %[[OFFSET_ADDR:.*]] = alloca i64, align 8 // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr inttoptr (i64 1 to ptr), i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr inttoptr (i64 1 to ptr), i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -312,17 +312,17 @@ char *one_one_OK(void) { // CHECK: define{{.*}} ptr @one_one_OK() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) static char *const base = (char *)1; static const unsigned long offset = 1; #line 1100 @@ -333,17 +333,17 @@ char *one_allones_BAD(void) { // CHECK: 
define{{.*}} ptr @one_allones_BAD() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) static char *const base = (char *)1; static const unsigned long offset = -1; #line 1200 @@ -358,7 +358,7 @@ char *allones_var(unsigned long offset) { // CHECK-NEXT: %[[OFFSET_ADDR:.*]] = alloca i64, align 8 // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr inttoptr (i64 -1 to ptr), i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr inttoptr (i64 -1 to ptr), i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -408,17 +408,17 @@ char *allones_one_BAD(void) { // CHECK: define{{.*}} ptr @allones_one_BAD() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 -1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 
-1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) static char *const base = (char *)-1; static const unsigned long offset = 1; #line 1500 @@ -429,17 +429,17 @@ char *allones_allones_OK(void) { // CHECK: define{{.*}} ptr @allones_allones_OK() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 -1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr 
inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) static char *const base = (char *)-1; static const unsigned long offset = -1; #line 1600 @@ -461,7 +461,7 @@ char *void_ptr(void *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize diff --git a/clang/test/CodeGen/catch-pointer-overflow-volatile.c b/clang/test/CodeGen/catch-pointer-overflow-volatile.c index 4b0653a0ae59e..626bbc0db7afb 100644 --- a/clang/test/CodeGen/catch-pointer-overflow-volatile.c +++ b/clang/test/CodeGen/catch-pointer-overflow-volatile.c @@ -23,7 +23,7 @@ char *volatile_ptr(char *volatile base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load volatile ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize diff --git a/clang/test/CodeGen/catch-pointer-overflow.c b/clang/test/CodeGen/catch-pointer-overflow.c index 899af73bd81e0..1f7f1729098c7 100644 --- a/clang/test/CodeGen/catch-pointer-overflow.c +++ b/clang/test/CodeGen/catch-pointer-overflow.c @@ -30,7 +30,7 @@ char *add_unsigned(char *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: 
%[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -179,7 +179,7 @@ char *postinc(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i32 1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i32 1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -241,7 +241,7 @@ char *preinc(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i32 1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i32 1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize diff --git a/clang/test/CodeGen/ext-int.c b/clang/test/CodeGen/ext-int.c index 714b7e122a706..e3d609a4ba4a2 100644 --- a/clang/test/CodeGen/ext-int.c +++ b/clang/test/CodeGen/ext-int.c @@ -154,7 +154,7 @@ _BitInt(129) *f1(_BitInt(129) *p) { } char *f2(char *p) { - // CHECK64: getelementptr inbounds i8, {{.*}} i64 24 + // CHECK64: getelementptr inbounds nuw i8, {{.*}} i64 24 return p + sizeof(_BitInt(129)); } diff --git a/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c b/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c index 7802168de4d75..d25d1e04f15fa 100644 --- a/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c +++ b/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c @@ -6,9 +6,9 @@ // the return value will be the value in A[2] // CHECK: @brev_ptr_inc // CHECK-DAG: llvm.hexagon.L2.loadri.pbr -// CHECK-DAG: getelementptr inbounds i8, {{.*}}i32 4 -// CHECK-NOT: getelementptr inbounds i8, {{.*}}i32 8 -// CHECK-NOT: getelementptr inbounds i8, {{.*}}i32 4 +// CHECK-DAG: getelementptr inbounds nuw i8, {{.*}}i32 4 +// CHECK-NOT: getelementptr inbounds nuw i8, {{.*}}i32 8 +// CHECK-NOT: getelementptr inbounds nuw i8, {{.*}}i32 4 int brev_ptr_inc(int A[], int B[]) { int *p0 = &B[0]; int *p1 = &A[0]; diff --git a/clang/test/CodeGen/integer-overflow.c b/clang/test/CodeGen/integer-overflow.c index 461b026d39615..9e8cde8b33b16 100644 --- a/clang/test/CodeGen/integer-overflow.c +++ b/clang/test/CodeGen/integer-overflow.c @@ -60,10 +60,10 @@ void test1(void) { // -fwrapv should turn off inbounds for GEP's, PR9256 extern int* P; ++P; - // DEFAULT: getelementptr inbounds i32, ptr + // DEFAULT: 
getelementptr inbounds nuw i32, ptr // WRAPV: getelementptr i32, ptr - // TRAPV: getelementptr inbounds i32, ptr - // CATCH_UB_POINTER: getelementptr inbounds i32, ptr + // TRAPV: getelementptr inbounds nuw i32, ptr + // CATCH_UB_POINTER: getelementptr inbounds nuw i32, ptr // NOCATCH_UB_POINTER: getelementptr i32, ptr // PR9350: char pre-increment never overflows. diff --git a/clang/test/CodeGen/ms-intrinsics.c b/clang/test/CodeGen/ms-intrinsics.c index c3d64fda0b901..459a708d9b2e0 100644 --- a/clang/test/CodeGen/ms-intrinsics.c +++ b/clang/test/CodeGen/ms-intrinsics.c @@ -156,7 +156,7 @@ unsigned char test_BitScanForward(unsigned long *Index, unsigned long Mask) { // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds i8, ptr %Index, {{i64|i32}} 4 +// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds nuw i8, ptr %Index, {{i64|i32}} 4 // CHECK: [[INDEX:%[0-9]+]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %Mask, i1 true) // CHECK: store i32 [[INDEX]], ptr [[IDXGEP]], align 4 // CHECK: br label %[[END_LABEL]] @@ -171,7 +171,7 @@ unsigned char test_BitScanReverse(unsigned long *Index, unsigned long Mask) { // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds i8, ptr %Index, {{i64|i32}} 4 +// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds nuw i8, ptr %Index, {{i64|i32}} 4 // CHECK: [[REVINDEX:%[0-9]+]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %Mask, i1 true) // CHECK: [[INDEX:%[0-9]+]] = xor i32 [[REVINDEX]], 31 // CHECK: store i32 [[INDEX]], ptr [[IDXGEP]], align 4 @@ -437,10 +437,10 @@ unsigned char test_InterlockedCompareExchange128( ++ExchangeLow, ++ComparandResult); } // CHECK-64: define{{.*}}i8 @test_InterlockedCompareExchange128(ptr{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%ExchangeHigh, i64{{[a-z_ ]*}}%ExchangeLow, ptr{{[a-z_ ]*}}%ComparandResult){{.*}}{ -// CHECK-64: %incdec.ptr = getelementptr inbounds i8, ptr %Destination, i64 8 +// CHECK-64: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Destination, i64 8 // CHECK-64: %inc = add nsw i64 %ExchangeHigh, 1 // CHECK-64: %inc1 = add nsw i64 %ExchangeLow, 1 -// CHECK-64: %incdec.ptr2 = getelementptr inbounds i8, ptr %ComparandResult, i64 8 +// CHECK-64: %incdec.ptr2 = getelementptr inbounds nuw i8, ptr %ComparandResult, i64 8 // CHECK-64: [[EH:%[0-9]+]] = zext i64 %inc to i128 // CHECK-64: [[EL:%[0-9]+]] = zext i64 %inc1 to i128 // CHECK-64: [[EHS:%[0-9]+]] = shl nuw i128 [[EH]], 64 @@ -486,7 +486,7 @@ short test_InterlockedIncrement16(short volatile *Addend) { return _InterlockedIncrement16(++Addend); } // CHECK: define{{.*}}i16 @test_InterlockedIncrement16(ptr{{[a-z_ ]*}}%Addend){{.*}}{ -// CHECK: %incdec.ptr = getelementptr inbounds i8, ptr %Addend, {{i64|i32}} 2 +// CHECK: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Addend, {{i64|i32}} 2 // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %incdec.ptr, i16 1 seq_cst, align 2 // CHECK: [[RESULT:%[0-9]+]] = add i16 [[TMP]], 1 // CHECK: ret i16 [[RESULT]] @@ -496,7 +496,7 @@ long test_InterlockedIncrement(long volatile *Addend) { return _InterlockedIncrement(++Addend); } // CHECK: define{{.*}}i32 @test_InterlockedIncrement(ptr{{[a-z_ ]*}}%Addend){{.*}}{ -// CHECK: %incdec.ptr = getelementptr inbounds i8, ptr 
%Addend, {{i64|i32}} 4 +// CHECK: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Addend, {{i64|i32}} 4 // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %incdec.ptr, i32 1 seq_cst, align 4 // CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1 // CHECK: ret i32 [[RESULT]] diff --git a/clang/test/CodeGen/ubsan-pointer-overflow.m b/clang/test/CodeGen/ubsan-pointer-overflow.m index 9192598da92fc..4ecdac655669f 100644 --- a/clang/test/CodeGen/ubsan-pointer-overflow.m +++ b/clang/test/CodeGen/ubsan-pointer-overflow.m @@ -5,7 +5,7 @@ void variable_len_array_arith(int n, int k) { int vla[n]; int (*p)[n] = &vla; - // CHECK: getelementptr inbounds i32, ptr {{.*}}, i64 [[INC:%.*]] + // CHECK: getelementptr inbounds nuw i32, ptr {{.*}}, i64 [[INC:%.*]] // CHECK: @llvm.smul.with.overflow.i64(i64 4, i64 [[INC]]), !nosanitize // CHECK-NOT: select // CHECK: call void @__ubsan_handle_pointer_overflow{{.*}} diff --git a/clang/test/CodeGen/vla.c b/clang/test/CodeGen/vla.c index 33621c5dd7a29..a22ba727df2fe 100644 --- a/clang/test/CodeGen/vla.c +++ b/clang/test/CodeGen/vla.c @@ -120,7 +120,7 @@ int test4(unsigned n, char (*p)[n][n+1][6]) { // CHECK-NEXT: [[T2:%.*]] = udiv i32 [[T1]], 2 // CHECK-NEXT: [[T3:%.*]] = mul nuw i32 [[DIM0]], [[DIM1]] // CHECK-NEXT: [[T4:%.*]] = mul nsw i32 [[T2]], [[T3]] - // CHECK-NEXT: [[T5:%.*]] = getelementptr inbounds [6 x i8], ptr [[T0]], i32 [[T4]] + // CHECK-NEXT: [[T5:%.*]] = getelementptr inbounds nuw [6 x i8], ptr [[T0]], i32 [[T4]] // CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[N]], align 4 // CHECK-NEXT: [[T7:%.*]] = udiv i32 [[T6]], 4 // CHECK-NEXT: [[T8:%.*]] = sub i32 0, [[T7]] diff --git a/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp b/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp index e842de6335046..fd9786de3a949 100644 --- a/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp +++ b/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp @@ -152,16 +152,16 @@ void f_branch_elided() // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP1]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16:![0-9]+]] +// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__END1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP2]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARRAYDECAY1]], i64 4 -// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: br label [[FOR_COND:%.*]] // CHECK: for.cond: -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[TMP3]], [[TMP4]] // CHECK-NEXT: [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 true) // CHECK-NEXT: br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] 
@@ -172,16 +172,16 @@ void f_branch_elided() // CHECK-NEXT: br label [[FOR_END:%.*]] // CHECK: for.body: // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR3]] // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 1 +// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] // CHECK: for.end: // CHECK-NEXT: ret void // @@ -204,16 +204,16 @@ void frl(int (&&e) [4]) // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP1]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__END1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP2]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARRAYDECAY1]], i64 4 -// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: br label [[FOR_COND:%.*]] // CHECK: for.cond: -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[TMP3]], [[TMP4]] // CHECK-NEXT: [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 false) // CHECK-NEXT: br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -224,16 +224,16 @@ void frl(int (&&e) [4]) // CHECK-NEXT: br label [[FOR_END:%.*]] // CHECK: for.body: // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR3]] // 
CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 1 +// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] // CHECK: for.end: // CHECK-NEXT: ret void // diff --git a/clang/test/CodeGenCXX/for-range.cpp b/clang/test/CodeGenCXX/for-range.cpp index 10d27206d12e4..088a34647c374 100644 --- a/clang/test/CodeGenCXX/for-range.cpp +++ b/clang/test/CodeGenCXX/for-range.cpp @@ -33,7 +33,7 @@ B *end(C&); extern B array[5]; -// CHECK-LABEL: define {{[^@]+}}@_Z9for_arrayv( +// CHECK-LABEL: @_Z9for_arrayv( // CHECK-NEXT: entry: // CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 1 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 @@ -57,7 +57,7 @@ extern B array[5]; // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds [[STRUCT_B]], ptr [[TMP3]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[TMP3]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8 // CHECK-NEXT: br label [[FOR_COND]] // CHECK: for.end: @@ -70,7 +70,7 @@ void for_array() { } } -// CHECK-LABEL: define {{[^@]+}}@_Z9for_rangev( +// CHECK-LABEL: @_Z9for_rangev( // CHECK-NEXT: entry: // CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 1 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 @@ -103,7 +103,7 @@ void for_array() { // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds [[STRUCT_B]], ptr [[TMP5]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[TMP5]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8 // CHECK-NEXT: br label [[FOR_COND]] // CHECK: for.end: @@ -116,7 +116,7 @@ void for_range() { } } -// CHECK-LABEL: define {{[^@]+}}@_Z16for_member_rangev( +// CHECK-LABEL: @_Z16for_member_rangev( // CHECK-NEXT: entry: // CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 1 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 @@ -149,7 +149,7 @@ void for_range() { // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds [[STRUCT_B]], ptr [[TMP5]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[TMP5]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8 // CHECK-NEXT: br label [[FOR_COND]] // CHECK: for.end: diff --git a/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp b/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp index f7df110ec0129..bcb2d875dce66 100644 --- a/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp +++ b/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp @@ -16,7 +16,7 @@ void (*d)(){test_transform<0>}; // CHECK-NEXT: [[BODY]]: 
// CHECK-NEXT: [[CUR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[BODY]] ] // CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i32, ptr [[BEGIN]], i64 [[CUR]] -// CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds [1 x i32], ptr @a, i64 0, i64 [[CUR]] +// CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds nuw [1 x i32], ptr @a, i64 0, i64 [[CUR]] // CHECK-NEXT: [[X:%.*]] = load i32, ptr [[SRC]] // CHECK-NEXT: store i32 [[X]], ptr [[DEST]] // CHECK-NEXT: [[NEXT]] = add nuw i64 [[CUR]], 1 diff --git a/clang/test/CodeGenCXX/vla.cpp b/clang/test/CodeGenCXX/vla.cpp index 4cf2b3b445b40..aadf51fce3a44 100644 --- a/clang/test/CodeGenCXX/vla.cpp +++ b/clang/test/CodeGenCXX/vla.cpp @@ -83,7 +83,7 @@ void test2(int b) { //CHECK: [[VLA_SIZEOF:%.*]] = mul nuw i64 4, [[VLA_NUM_ELEMENTS_PRE]] //CHECK-NEXT: [[VLA_NUM_ELEMENTS_POST:%.*]] = udiv i64 [[VLA_SIZEOF]], 4 - //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds i32, ptr {{%.*}}, i64 [[VLA_NUM_ELEMENTS_POST]] + //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds nuw i32, ptr {{%.*}}, i64 [[VLA_NUM_ELEMENTS_POST]] //X64-NEXT: store ptr [[VLA_END_PTR]], ptr %__end1 //AMDGCN-NEXT: store ptr [[VLA_END_PTR]], ptr [[END]] for (int d : varr) 0; @@ -116,7 +116,7 @@ void test3(int b, int c) { //CHECK-NEXT: [[VLA_SIZEOF_DIM2:%.*]] = mul nuw i64 4, [[VLA_DIM2_PRE]] //CHECK-NEXT: [[VLA_NUM_ELEMENTS:%.*]] = udiv i64 [[VLA_SIZEOF]], [[VLA_SIZEOF_DIM2]] //CHECK-NEXT: [[VLA_END_INDEX:%.*]] = mul nsw i64 [[VLA_NUM_ELEMENTS]], [[VLA_DIM2_PRE]] - //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds i32, ptr {{%.*}}, i64 [[VLA_END_INDEX]] + //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds nuw i32, ptr {{%.*}}, i64 [[VLA_END_INDEX]] //X64-NEXT: store ptr [[VLA_END_PTR]], ptr %__end //AMDGCN-NEXT: store ptr [[VLA_END_PTR]], ptr [[END]] diff --git a/clang/test/CodeGenHLSL/buffer-array-operator.hlsl b/clang/test/CodeGenHLSL/buffer-array-operator.hlsl index f5556df30871c..02e570ebdcb4f 100644 --- a/clang/test/CodeGenHLSL/buffer-array-operator.hlsl +++ b/clang/test/CodeGenHLSL/buffer-array-operator.hlsl @@ -17,7 +17,7 @@ void fn(int Idx) { // CHECK-NEXT: %h = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %this1, i32 0, i32 0 // CHECK-NEXT: %0 = load ptr, ptr %h, align 4 // CHECK-NEXT: %1 = load i32, ptr %Idx.addr, align 4 -// CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %0, i32 %1 +// CHECK-NEXT: %arrayidx = getelementptr inbounds nuw float, ptr %0, i32 %1 // CHECK-NEXT: ret ptr %arrayidx // Const comes next, and returns the pointer instead of the value. 
@@ -26,5 +26,5 @@ void fn(int Idx) { // CHECK-NEXT: %h = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %this1, i32 0, i32 0 // CHECK-NEXT: %0 = load ptr, ptr %h, align 4 // CHECK-NEXT: %1 = load i32, ptr %Idx.addr, align 4 -// CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %0, i32 %1 +// CHECK-NEXT: %arrayidx = getelementptr inbounds nuw float, ptr %0, i32 %1 // CHECK-NEXT: ret ptr %arrayidx diff --git a/clang/test/CodeGenSYCL/address-space-deduction.cpp b/clang/test/CodeGenSYCL/address-space-deduction.cpp index 96075a47343fe..5910ec3bfc305 100644 --- a/clang/test/CodeGenSYCL/address-space-deduction.cpp +++ b/clang/test/CodeGenSYCL/address-space-deduction.cpp @@ -33,55 +33,55 @@ // CHECK-NEXT: store ptr addrspace(4) [[I_ASCAST]], ptr addrspace(4) [[PPTR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PPTR_ASCAST]], align 8 // CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(4) [[TMP0]], [[I_ASCAST]] -// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[CMP]] to i8 -// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[IS_I_PTR_ASCAST]], align 1 +// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[CMP]] to i8 +// CHECK-NEXT: store i8 [[STOREDV]], ptr addrspace(4) [[IS_I_PTR_ASCAST]], align 1 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PPTR_ASCAST]], align 8 // CHECK-NEXT: store i32 66, ptr addrspace(4) [[TMP1]], align 4 // CHECK-NEXT: store i32 23, ptr addrspace(4) [[VAR23_ASCAST]], align 4 // CHECK-NEXT: store ptr addrspace(4) [[VAR23_ASCAST]], ptr addrspace(4) [[CP_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CP_ASCAST]], align 8 -// CHECK-NEXT: store i8 41, ptr addrspace(4) [[TMP3]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CP_ASCAST]], align 8 +// CHECK-NEXT: store i8 41, ptr addrspace(4) [[TMP2]], align 1 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [42 x i32], ptr addrspace(4) [[ARR_ASCAST]], i64 0, i64 0 // CHECK-NEXT: store ptr addrspace(4) [[ARRAYDECAY]], ptr addrspace(4) [[CPP_ASCAST]], align 8 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CPP_ASCAST]], align 8 -// CHECK-NEXT: store i8 43, ptr addrspace(4) [[TMP5]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CPP_ASCAST]], align 8 +// CHECK-NEXT: store i8 43, ptr addrspace(4) [[TMP3]], align 1 // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [42 x i32], ptr addrspace(4) [[ARR_ASCAST]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[ARRAYDECAY1]], i64 10 // CHECK-NEXT: store ptr addrspace(4) [[ADD_PTR]], ptr addrspace(4) [[APTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 // CHECK-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [42 x i32], ptr addrspace(4) [[ARR_ASCAST]], i64 0, i64 0 -// CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[ARRAYDECAY2]], i64 168 -// CHECK-NEXT: [[CMP4:%.*]] = icmp ult ptr addrspace(4) [[TMP6]], [[ADD_PTR3]] +// CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[ARRAYDECAY2]], i64 168 +// CHECK-NEXT: [[CMP4:%.*]] = icmp ult ptr addrspace(4) [[TMP4]], [[ADD_PTR3]] // CHECK-NEXT: br i1 [[CMP4]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] // CHECK: if.then: -// CHECK-NEXT: [[TMP7:%.*]] = 
load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 -// CHECK-NEXT: store i32 44, ptr addrspace(4) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 +// CHECK-NEXT: store i32 44, ptr addrspace(4) [[TMP5]], align 4 // CHECK-NEXT: br label [[IF_END]] // CHECK: if.end: // CHECK-NEXT: store ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str to ptr addrspace(4)), ptr addrspace(4) [[STR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP8]], i64 0 -// CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(4) [[ARRAYIDX]], align 1 -// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP9]] to i32 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP6]], i64 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(4) [[ARRAYIDX]], align 1 +// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP7]] to i32 // CHECK-NEXT: store i32 [[CONV]], ptr addrspace(4) [[I_ASCAST]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 -// CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP10]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 +// CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], 2 // CHECK-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK: cond.true: -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 // CHECK-NEXT: br label [[COND_END:%.*]] // CHECK: cond.false: // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: -// CHECK-NEXT: [[COND:%.*]] = phi ptr addrspace(4) [ [[TMP11]], [[COND_TRUE]] ], [ addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), [[COND_FALSE]] ] +// CHECK-NEXT: [[COND:%.*]] = phi ptr addrspace(4) [ [[TMP9]], [[COND_TRUE]] ], [ addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), [[COND_FALSE]] ] // CHECK-NEXT: store ptr addrspace(4) [[COND]], ptr addrspace(4) [[PHI_STR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 -// CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], 2 -// CHECK-NEXT: [[TMP13:%.*]] = zext i1 [[CMP6]] to i64 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 +// CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP10]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = zext i1 [[CMP6]] to i64 // CHECK-NEXT: [[COND7:%.*]] = select i1 [[CMP6]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.2 to ptr addrspace(4)), ptr addrspace(4) null // CHECK-NEXT: store ptr addrspace(4) [[COND7]], ptr addrspace(4) [[SELECT_NULL_ASCAST]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 -// CHECK-NEXT: store ptr addrspace(4) [[TMP14]], ptr addrspace(4) [[SELECT_STR_TRIVIAL1_ASCAST]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[TMP12]], ptr addrspace(4) [[SELECT_STR_TRIVIAL1_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), ptr addrspace(4) [[SELECT_STR_TRIVIAL2_ASCAST]], align 8 // CHECK-NEXT: ret void // diff --git 
a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 9d202e0d04682..e4254d1e64bec 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -47,7 +47,7 @@ typedef unsigned long long uint64_t; // CHECK-NEXT: [[CONV5_I:%.*]] = zext nneg i8 [[TMP0]] to i64 // CHECK-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], -48 // CHECK-NEXT: [[SUB_I:%.*]] = add i64 [[ADD_I]], [[CONV5_I]] -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I]] // CHECK: cleanup.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I]] = phi ptr [ [[INCDEC_PTR_I]], [[IF_THEN_I]] ], [ [[__TAGP_ADDR_0_I]], [[WHILE_BODY_I]] ] @@ -79,7 +79,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { // CHECK-NEXT: [[CONV5_I:%.*]] = zext nneg i8 [[TMP0]] to i64 // CHECK-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], -48 // CHECK-NEXT: [[SUB_I:%.*]] = add i64 [[ADD_I]], [[CONV5_I]] -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I]] // CHECK: cleanup.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I]] = phi ptr [ [[INCDEC_PTR_I]], [[IF_THEN_I]] ], [ [[__TAGP_ADDR_0_I]], [[WHILE_BODY_I]] ] @@ -120,7 +120,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base10(const char *p) { // CHECK-NEXT: [[CONV25_I:%.*]] = zext nneg i8 [[TMP0]] to i64 // CHECK-NEXT: [[ADD26_I:%.*]] = add i64 [[MUL24_I]], [[DOTSINK]] // CHECK-NEXT: [[ADD28_I:%.*]] = add i64 [[ADD26_I]], [[CONV25_I]] -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I]] // CHECK: cleanup.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I]] = phi ptr [ [[INCDEC_PTR_I]], [[IF_END31_I]] ], [ [[__TAGP_ADDR_0_I]], [[IF_ELSE17_I]] ] @@ -141,7 +141,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i8 [[TMP0]], 48 // CHECK-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[WHILE_COND_I14_I:%.*]] // CHECK: if.then.i: -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 1 // CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA4]] // CHECK-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I:%.*]] [ // CHECK-NEXT: i8 120, label [[WHILE_COND_I30_I_PREHEADER:%.*]] @@ -173,7 +173,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CONV25_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // CHECK-NEXT: [[ADD26_I_I:%.*]] = add i64 [[MUL24_I_I]], [[DOTSINK]] // CHECK-NEXT: [[ADD28_I_I:%.*]] = add i64 [[ADD26_I_I]], [[CONV25_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I40_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I40_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I36_I]] // CHECK: cleanup.i36.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I37_I]] = phi ptr [ [[INCDEC_PTR_I40_I]], [[IF_END31_I_I]] ], [ [[__TAGP_ADDR_0_I31_I]], [[IF_ELSE17_I_I]] ] @@ -195,7 +195,7 @@ extern 
"C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CONV5_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // CHECK-NEXT: [[ADD_I_I:%.*]] = add i64 [[MUL_I_I]], -48 // CHECK-NEXT: [[SUB_I_I:%.*]] = add i64 [[ADD_I_I]], [[CONV5_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I_I]] // CHECK: cleanup.i.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I_I]] = phi ptr [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ], [ [[__TAGP_ADDR_0_I_I]], [[WHILE_BODY_I_I]] ] @@ -216,7 +216,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CONV5_I26_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // CHECK-NEXT: [[ADD_I27_I:%.*]] = add i64 [[MUL_I25_I]], -48 // CHECK-NEXT: [[SUB_I28_I:%.*]] = add i64 [[ADD_I27_I]], [[CONV5_I26_I]] -// CHECK-NEXT: [[INCDEC_PTR_I29_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I29_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I20_I]] // CHECK: cleanup.i20.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I21_I]] = phi ptr [ [[INCDEC_PTR_I29_I]], [[IF_THEN_I24_I]] ], [ [[__TAGP_ADDR_0_I15_I]], [[WHILE_BODY_I18_I]] ] @@ -2367,7 +2367,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // DEFAULT-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] // DEFAULT: if.then.i.i: -// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 // DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] // DEFAULT-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ // DEFAULT-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] @@ -2399,7 +2399,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // DEFAULT-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // DEFAULT-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I36_I_I]] // DEFAULT: cleanup.i36.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] @@ -2421,7 +2421,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // DEFAULT-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // DEFAULT-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I_I_I]] // DEFAULT: cleanup.i.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] @@ 
-2442,7 +2442,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // DEFAULT-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 // DEFAULT-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I20_I_I]] // DEFAULT: cleanup.i20.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] @@ -2466,7 +2466,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // APPROX-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] // APPROX: if.then.i.i: -// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 // APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] // APPROX-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ // APPROX-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] @@ -2498,7 +2498,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // APPROX-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // APPROX-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I36_I_I]] // APPROX: cleanup.i36.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] @@ -2520,7 +2520,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // APPROX-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // APPROX-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I_I_I]] // APPROX: cleanup.i.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] @@ -2541,7 +2541,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // APPROX-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 // APPROX-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I20_I_I]] // APPROX: cleanup.i20.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ 
[[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] @@ -2565,7 +2565,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // DEFAULT-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] // DEFAULT: if.then.i.i: -// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 // DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] // DEFAULT-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ // DEFAULT-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] @@ -2597,7 +2597,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // DEFAULT-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // DEFAULT-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I36_I_I]] // DEFAULT: cleanup.i36.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] @@ -2619,7 +2619,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // DEFAULT-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // DEFAULT-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I_I_I]] // DEFAULT: cleanup.i.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] @@ -2640,7 +2640,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // DEFAULT-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 // DEFAULT-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I20_I_I]] // DEFAULT: cleanup.i20.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] @@ -2663,7 +2663,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // APPROX-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] // APPROX: if.then.i.i: -// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 // APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] // APPROX-NEXT: switch 
i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ // APPROX-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] @@ -2695,7 +2695,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // APPROX-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // APPROX-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I36_I_I]] // APPROX: cleanup.i36.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] @@ -2717,7 +2717,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // APPROX-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // APPROX-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I_I_I]] // APPROX: cleanup.i.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] @@ -2738,7 +2738,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // APPROX-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 // APPROX-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I20_I_I]] // APPROX: cleanup.i20.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] @@ -3059,7 +3059,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // DEFAULT: _ZL5normfiPKf.exit: @@ -3079,7 +3079,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract float [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr 
inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // FINITEONLY: _ZL5normfiPKf.exit: @@ -3099,7 +3099,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // APPROX: _ZL5normfiPKf.exit: @@ -3123,7 +3123,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // DEFAULT: _ZL4normiPKd.exit: @@ -3143,7 +3143,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract double [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // FINITEONLY: _ZL4normiPKd.exit: @@ -3163,7 +3163,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // APPROX: _ZL4normiPKd.exit: @@ -3483,7 +3483,7 @@ extern "C" __device__ double test_rint(double x) { // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], 
[[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // DEFAULT: _ZL6rnormfiPKf.exit: @@ -3503,7 +3503,7 @@ extern "C" __device__ double test_rint(double x) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract float [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // FINITEONLY: _ZL6rnormfiPKf.exit: @@ -3523,7 +3523,7 @@ extern "C" __device__ double test_rint(double x) { // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // APPROX: _ZL6rnormfiPKf.exit: @@ -3547,7 +3547,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // DEFAULT: _ZL5rnormiPKd.exit: @@ -3567,7 +3567,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract double [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // FINITEONLY: _ZL5rnormiPKd.exit: @@ -3587,7 +3587,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // APPROX-NEXT: [[TMP0:%.*]] = 
load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // APPROX: _ZL5rnormiPKd.exit: diff --git a/clang/test/OpenMP/bug60602.cpp b/clang/test/OpenMP/bug60602.cpp index cb2e4e5b11e33..0789ef958e523 100644 --- a/clang/test/OpenMP/bug60602.cpp +++ b/clang/test/OpenMP/bug60602.cpp @@ -58,13 +58,13 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) { // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i64 0 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP8]] to i64 // CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[CONV]], 4 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[B_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 0 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP11]], i64 0 // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[TMP12]] to i64 // CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[CONV2]], 4 @@ -134,13 +134,13 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) { // CHECK-NEXT: [[TMP46:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP47:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP48:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP48]], i64 0 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP48]], i64 0 // CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV5:%.*]] = sext i32 [[TMP49]] to i64 // CHECK-NEXT: [[TMP50:%.*]] = mul nuw i64 [[CONV5]], 4 // CHECK-NEXT: [[TMP51:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[B_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i64 0 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP52]], i64 0 // CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV7:%.*]] = sext i32 [[TMP53]] to i64 // CHECK-NEXT: [[TMP54:%.*]] = mul nuw i64 [[CONV7]], 4 diff --git a/clang/test/OpenMP/declare_mapper_codegen.cpp b/clang/test/OpenMP/declare_mapper_codegen.cpp index 52d5ceffa1471..d2954b7a74821 100644 --- a/clang/test/OpenMP/declare_mapper_codegen.cpp +++ b/clang/test/OpenMP/declare_mapper_codegen.cpp @@ -129,7 +129,7 @@ class C { // CK0-DAG: [[BBEGIN:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK0-DAG: [[BBEGIN2:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK0-DAG: [[BARRBEGIN:%.+]] = load ptr, ptr [[BBEGIN2]] -// 
CK0-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 +// CK0-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds nuw double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 // CK0-DAG: [[BEND:%.+]] = getelementptr ptr, ptr [[BBEGIN]], i32 1 // CK0-DAG: [[ABEGINI:%.+]] = ptrtoint ptr [[ABEGIN]] to i64 // CK0-DAG: [[BENDI:%.+]] = ptrtoint ptr [[BEND]] to i64 @@ -965,7 +965,7 @@ class C { // CK4-DAG: [[BBEGIN:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK4-DAG: [[BBEGIN2:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK4-DAG: [[BARRBEGIN:%.+]] = load ptr, ptr [[BBEGIN2]] -// CK4-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 +// CK4-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds nuw double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 // CK4-DAG: [[BEND:%.+]] = getelementptr ptr, ptr [[BBEGIN]], i32 1 // CK4-DAG: [[ABEGINI:%.+]] = ptrtoint ptr [[ABEGIN]] to i64 // CK4-DAG: [[BENDI:%.+]] = ptrtoint ptr [[BEND]] to i64 diff --git a/clang/test/OpenMP/distribute_codegen.cpp b/clang/test/OpenMP/distribute_codegen.cpp index ea619cb6e0f26..6c588ba25db30 100644 --- a/clang/test/OpenMP/distribute_codegen.cpp +++ b/clang/test/OpenMP/distribute_codegen.cpp @@ -662,24 +662,24 @@ int fint(void) { return ftemplate(); } // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = 
getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK1-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1574,21 +1574,21 @@ int fint(void) { return ftemplate(); } // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK3-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK3-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK3-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK3-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -2252,24 +2252,24 @@ int fint(void) { return ftemplate(); } // CHECK17-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK17-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] 
+// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK17-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK17-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK17-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK17-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK17-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK17: omp.body.continue: @@ -2790,21 +2790,21 @@ int fint(void) { return ftemplate(); } // CHECK19-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK19-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK19-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK19-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK19-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK19-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX7:%.*]] 
= getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK19-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK19-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK19: omp.body.continue: diff --git a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp index b019b4ff92ad5..93a6779ac02e8 100644 --- a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp @@ -175,16 +175,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP6]] // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP7]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP7]], i64 9 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -214,7 +214,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP22]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -229,19 +229,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP30]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// 
CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP33]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i64 0 // CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP36]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP36]], i64 9 // CHECK1-NEXT: [[TMP37:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP38]], align 8 @@ -562,9 +562,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp index f7353172e235c..ad93fd6030ac7 100644 --- a/clang/test/OpenMP/distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_codegen.cpp @@ -706,24 +706,24 @@ int fint(void) { return ftemplate(); } // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: 
[[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK1-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1682,21 +1682,21 @@ int fint(void) { return ftemplate(); } // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK3-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK3-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK3-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group 
[[ACC_GRP18]] // CHECK3-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK3-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -2664,24 +2664,24 @@ int fint(void) { return ftemplate(); } // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK5-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK5-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK5-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -3671,21 +3671,21 @@ int fint(void) { return ftemplate(); } // CHECK7-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds 
float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK7-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK7-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK7-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK7-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK7-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK7-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK7-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK7-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK7-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK7: omp.body.continue: @@ -4290,24 +4290,24 @@ int fint(void) { return ftemplate(); } // CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP5]] to i64 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 [[IDXPROM]] +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i64 [[IDXPROM]] // CHECK9-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK9-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM1]] +// CHECK9-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM1]] // CHECK9-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[MUL3:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK9-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM4]] +// CHECK9-NEXT: [[ARRAYIDX5:%.*]] = getelementptr 
inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM4]] // CHECK9-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[MUL6:%.*]] = fmul float [[MUL3]], [[TMP12]] // CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK9-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM7]] +// CHECK9-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM7]] // CHECK9-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -4606,21 +4606,21 @@ int fint(void) { return ftemplate(); } // CHECK11-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 [[TMP5]] +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i32 [[TMP5]] // CHECK11-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[TMP8]] +// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 [[TMP8]] // CHECK11-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX1]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[MUL2:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK11-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[TMP11]] +// CHECK11-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 [[TMP11]] // CHECK11-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[MUL4:%.*]] = fmul float [[MUL2]], [[TMP12]] // CHECK11-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP14]] +// CHECK11-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i32 [[TMP14]] // CHECK11-NEXT: store float [[MUL4]], ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK11: omp.body.continue: @@ -4928,24 +4928,24 @@ int fint(void) { return ftemplate(); } // CHECK13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP5]] to i64 -// CHECK13-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds float, ptr [[TMP4]], i64 [[IDXPROM]] +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i64 [[IDXPROM]] // CHECK13-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK13-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM1]] +// CHECK13-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM1]] // CHECK13-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[MUL3:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK13-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM4]] +// CHECK13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM4]] // CHECK13-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[MUL6:%.*]] = fmul float [[MUL3]], [[TMP12]] // CHECK13-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK13-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM7]] +// CHECK13-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM7]] // CHECK13-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -5275,21 +5275,21 @@ int fint(void) { return ftemplate(); } // CHECK15-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 [[TMP5]] +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i32 [[TMP5]] // CHECK15-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[TMP8]] +// CHECK15-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 [[TMP8]] // CHECK15-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX1]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[MUL2:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK15-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group 
[[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[TMP11]] +// CHECK15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 [[TMP11]] // CHECK15-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[MUL4:%.*]] = fmul float [[MUL2]], [[TMP12]] // CHECK15-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP14]] +// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i32 [[TMP14]] // CHECK15-NEXT: store float [[MUL4]], ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK15: omp.body.continue: @@ -5782,24 +5782,24 @@ int fint(void) { return ftemplate(); } // CHECK17-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK17-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK17-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK17-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK17-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK17-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK17-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK17: omp.body.continue: @@ -6373,21 +6373,21 @@ int fint(void) { return ftemplate(); } 
// CHECK19-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK19-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK19-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK19-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK19-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK19-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK19-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK19-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK19: omp.body.continue: @@ -6970,24 +6970,24 @@ int fint(void) { return ftemplate(); } // CHECK21-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK21-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK21-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK21-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK21-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK21-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK21-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // 
CHECK21-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK21-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK21-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK21-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK21-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK21-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK21-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK21-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK21: omp.body.continue: @@ -7592,21 +7592,21 @@ int fint(void) { return ftemplate(); } // CHECK23-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK23-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK23-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK23-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK23-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK23-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK23-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK23-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK23-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK23-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK23-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP20]] // 
CHECK23-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK23: omp.body.continue: diff --git a/clang/test/OpenMP/for_linear_codegen.cpp b/clang/test/OpenMP/for_linear_codegen.cpp index 395ccdbeed763..5a21fe8509fd3 100644 --- a/clang/test/OpenMP/for_linear_codegen.cpp +++ b/clang/test/OpenMP/for_linear_codegen.cpp @@ -650,7 +650,7 @@ int main() { // CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP16]], [[MUL9]] // CHECK1-NEXT: store i32 [[ADD10]], ptr [[LVAR5]], align 4 // CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[PVAR4]], align 8 -// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 1 +// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i32 1 // CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[PVAR4]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP6]], align 8 // CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 diff --git a/clang/test/OpenMP/for_reduction_codegen.cpp b/clang/test/OpenMP/for_reduction_codegen.cpp index ea32e98bf1423..83632db238484 100644 --- a/clang/test/OpenMP/for_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_reduction_codegen.cpp @@ -1021,14 +1021,14 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP5]] -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX]], i64 0 // CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP7]] // CHECK1-NEXT: [[TMP8:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP8]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -1054,16 +1054,16 @@ int main() { // CHECK1-NEXT: [[TMP19:%.*]] = sub i64 [[TMP17]], [[TMP18]] // CHECK1-NEXT: [[TMP20:%.*]] = sdiv exact i64 [[TMP19]], ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64) // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[VLA7]], i64 [[TMP20]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[ARRAYIDX8]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[ARRAYDECAY]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[ARRAYDECAY]], i64 1 // CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 // CHECK1-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN11:%.*]] = add nsw i64 0, [[TMP23]] -// 
CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN11]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN11]] // CHECK1-NEXT: [[ARRAYDECAY13:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[ARRAYIDX12]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDECAY13]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[ARRAYDECAY13]], i64 2 // CHECK1-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK1-NEXT: [[TMP25:%.*]] = ptrtoint ptr [[ARRAYIDX9]] to i64 // CHECK1-NEXT: [[TMP26:%.*]] = sub i64 [[TMP24]], [[TMP25]] @@ -1580,10 +1580,10 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP3]] -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP4]] -// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX4]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX4]], i64 1 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr [[ARR6]], i32 0, i32 0, i32 0 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] @@ -1757,13 +1757,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 4 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 4 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK1-NEXT: [[TMP7:%.*]] = sub i64 [[TMP5]], [[TMP6]] @@ -1963,11 +1963,11 @@ int main() { // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds 
nuw [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 1 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [1 x [6 x %struct.S]], ptr [[VAR24]], i32 0, i32 0, i32 0 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 6 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] @@ -2148,13 +2148,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 1 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [1 x [6 x %struct.S]], ptr [[VAR24]], i32 0, i32 0, i32 0 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 6 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] @@ -2335,13 +2335,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 1 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 1 // CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR24]]) // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], 
align 8 @@ -2459,8 +2459,8 @@ int main() { // CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x %struct.S], ptr [[TMP0]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [5 x %struct.S], ptr [[TMP0]], i64 0, i64 4 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [5 x %struct.S], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [5 x %struct.S], ptr [[TMP0]], i64 0, i64 4 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [5 x %struct.S], ptr [[VVAR22]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 5 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]] @@ -2641,9 +2641,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP2]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP2]], i64 0, i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP3]], i64 0, i64 2 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP3]], i64 0, i64 2 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[VAR34]], i32 0, i32 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]] @@ -2826,9 +2826,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP2]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP2]], i64 0, i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP3]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[VAR34]], i32 0, i32 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]] @@ -3012,9 +3012,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP2]], i64 0, i64 2 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP2]], i64 0, i64 2 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP3]], i64 0, i64 3 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr 
inbounds nuw [4 x %struct.S], ptr [[TMP3]], i64 0, i64 3 // CHECK1-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP6:%.*]] = sub i64 [[TMP4]], [[TMP5]] @@ -3974,8 +3974,8 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 1 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 40 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 40 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [40 x %struct.S.0], ptr [[ARR4]], i32 0, i32 0 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 40 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP6]] diff --git a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp index 16d6c23542fce..82f94c949eea6 100644 --- a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp +++ b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp @@ -1074,14 +1074,14 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP5]] -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX]], i64 0 // CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP7]] // CHECK1-NEXT: [[TMP8:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP8]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -1109,16 +1109,16 @@ int main() { // CHECK1-NEXT: [[TMP19:%.*]] = sub i64 [[TMP17]], [[TMP18]] // CHECK1-NEXT: [[TMP20:%.*]] = sdiv exact i64 [[TMP19]], ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64) // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[VLA7]], i64 [[TMP20]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[ARRAYIDX9]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[ARRAYDECAY]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[ARRAYDECAY]], i64 1 // 
CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 // CHECK1-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN12:%.*]] = add nsw i64 0, [[TMP23]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN12]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN12]] // CHECK1-NEXT: [[ARRAYDECAY14:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[ARRAYIDX13]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDECAY14]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[ARRAYDECAY14]], i64 2 // CHECK1-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[ARRAYIDX15]] to i64 // CHECK1-NEXT: [[TMP25:%.*]] = ptrtoint ptr [[ARRAYIDX10]] to i64 // CHECK1-NEXT: [[TMP26:%.*]] = sub i64 [[TMP24]], [[TMP25]] @@ -1669,13 +1669,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 4 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 4 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK1-NEXT: [[TMP7:%.*]] = sub i64 [[TMP5]], [[TMP6]] @@ -1877,8 +1877,8 @@ int main() { // CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 4 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 4 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [5 x %struct.S.0], ptr [[VVAR22]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_S_0:%.*]], ptr [[ARRAY_BEGIN]], i64 5 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]] @@ -2066,9 +2066,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr 
[[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[TMP2]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S.0], ptr [[TMP2]], i64 0, i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[TMP3]], i64 0, i64 2 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [4 x %struct.S.0], ptr [[TMP3]], i64 0, i64 2 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[VAR34]], i32 0, i32 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_S_0:%.*]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]] @@ -2979,8 +2979,8 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [42 x %struct.S], ptr [[TMP0]], i64 0, i64 1 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [42 x %struct.S], ptr [[TMP0]], i64 0, i64 40 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [42 x %struct.S], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [42 x %struct.S], ptr [[TMP0]], i64 0, i64 40 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [40 x %struct.S], ptr [[ARR4]], i32 0, i32 0 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 40 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP6]] diff --git a/clang/test/OpenMP/for_reduction_task_codegen.cpp b/clang/test/OpenMP/for_reduction_task_codegen.cpp index ea93323de77d0..b875279c2a144 100644 --- a/clang/test/OpenMP/for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/for_reduction_task_codegen.cpp @@ -68,16 +68,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP5]] // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP6]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP6]], i64 9 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = sub i64 [[TMP8]], [[TMP9]] @@ -107,7 +107,7 @@ int main(int argc, char **argv) { 
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP21]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP23]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -122,19 +122,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP28]], align 8 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP29]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP34]] // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP35]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP35]], i64 9 // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP36]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP30]], align 8 // CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP37]], align 8 @@ -459,9 +459,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: 
[[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/for_scan_codegen.cpp b/clang/test/OpenMP/for_scan_codegen.cpp index 4cf18a76fbfef..61e6534db471e 100644 --- a/clang/test/OpenMP/for_scan_codegen.cpp +++ b/clang/test/OpenMP/for_scan_codegen.cpp @@ -39,13 +39,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -72,13 +72,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -132,13 +132,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + 
// CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -179,13 +179,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -217,13 +217,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -280,13 +280,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, 
ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/for_simd_scan_codegen.cpp b/clang/test/OpenMP/for_simd_scan_codegen.cpp index 29af5f74c5b5b..829f2656042fb 100644 --- a/clang/test/OpenMP/for_simd_scan_codegen.cpp +++ b/clang/test/OpenMP/for_simd_scan_codegen.cpp @@ -39,13 +39,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -72,13 +72,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp 
eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -132,13 +132,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -179,13 +179,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -217,13 +217,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw 
double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -280,13 +280,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/irbuilder_for_iterator.cpp b/clang/test/OpenMP/irbuilder_for_iterator.cpp index 0098a7db575c3..ec1c3af744b49 100644 --- a/clang/test/OpenMP/irbuilder_for_iterator.cpp +++ b/clang/test/OpenMP/irbuilder_for_iterator.cpp @@ -78,18 +78,18 @@ extern "C" void workshareloop_iterator(float *a, float *b, float *c) { // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP9]], [[TMP12]] // CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM4]] // CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_rangefor.cpp b/clang/test/OpenMP/irbuilder_for_rangefor.cpp index 45b34621afbb9..86a043e638bc3 100644 --- a/clang/test/OpenMP/irbuilder_for_rangefor.cpp +++ b/clang/test/OpenMP/irbuilder_for_rangefor.cpp @@ -94,18 +94,18 @@ extern "C" void 
workshareloop_rangefor(float *a, float *b, float *c) { // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP12]], [[TMP15]] // CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM4]] // CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned.c b/clang/test/OpenMP/irbuilder_for_unsigned.c index b0043b823ac85..675871a87b3bd 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned.c @@ -65,24 +65,24 @@ extern "C" void workshareloop_unsigned(float *a, float *b, float *c, float *d) { // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP11]], [[TMP14]] // CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP15]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP17]] // CHECK-NEXT: [[TMP18:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK-NEXT: 
[[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP18]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_auto.c b/clang/test/OpenMP/irbuilder_for_unsigned_auto.c index 19b2770bfa2df..39ede3ef971d0 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_auto.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_auto.c @@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_auto(float *a, float *b, float *c, float // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]] // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]] // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_down.c b/clang/test/OpenMP/irbuilder_for_unsigned_down.c index 6e179826a6efa..5515f086c34a7 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_down.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_down.c @@ -67,7 +67,7 @@ extern "C" void workshareloop_unsigned_down(float *a) { // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM]] // CHECK-NEXT: store float [[CONV]], ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git 
a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c index 8f3297061938b..f20b60e608d2f 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c @@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_dynamic(float *a, float *b, float *c, flo // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]] // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]] // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c index c2b0948bf7aeb..599f256243b11 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c @@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_dynamic_chunked(float *a, float *b, float // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], 
i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]] // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]] // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c b/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c index 68becf9f694ad..c27bcba155910 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c @@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_runtime(float *a, float *b, float *c, flo // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]] // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]] // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git 
a/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c b/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c index 71fb6b5473da8..b937568ca9f11 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c @@ -108,24 +108,24 @@ extern "C" void workshareloop_unsigned_static_chunked(float *a, float *b, float // CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP19]], [[TMP22]] // CHECK-NEXT: [[TMP23:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP25]] // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP27]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP26]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/map_struct_ordering.cpp b/clang/test/OpenMP/map_struct_ordering.cpp index d5b22d8ff2a4d..a52ddad465f37 100644 --- a/clang/test/OpenMP/map_struct_ordering.cpp +++ b/clang/test/OpenMP/map_struct_ordering.cpp @@ -57,7 +57,7 @@ int map_struct() { // CHECK-NEXT: [[DATUM:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DAT]], i32 0, i32 0 // CHECK-NEXT: [[DATUM2:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DAT]], i32 0, i32 0 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DATUM2]], align 8 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 0 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_DESCRIPTOR]], ptr [[DAT]], i32 1 // CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 // CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[DAT]] to i64 diff --git a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp index e90f0783787c0..7d467293d0c8f 100644 --- a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp 
+++ b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], 
ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp index 180ff3a94d24c..b0652c843845c 100644 --- a/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp @@ -84,9 +84,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -138,10 +138,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw 
[4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp index 2061da7c4d781..b0d00c5f539b1 100644 --- a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call 
ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp index a69844dc4dee2..7def61251b24e 100644 --- a/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp @@ -80,9 +80,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -134,10 +134,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = 
getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/ordered_codegen.cpp b/clang/test/OpenMP/ordered_codegen.cpp index 0a73eaefc9808..67285cfaef34d 100644 --- a/clang/test/OpenMP/ordered_codegen.cpp +++ b/clang/test/OpenMP/ordered_codegen.cpp @@ -255,21 +255,21 @@ void foo_simd(int low, int up) { // CHECK1-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP7]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP7]] // CHECK1-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[TMP10]] // CHECK1-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 // CHECK1-NEXT: [[MUL3:%.*]] = fmul float [[TMP8]], [[TMP11]] // CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP13]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP13]] // CHECK1-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP14]] // CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP16:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP15]], i64 [[TMP16]] // CHECK1-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4 // CHECK1-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -485,24 +485,24 @@ void foo_simd(int low, int up) { // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP9]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr 
[[TMP8]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM7:%.*]] = zext i8 [[TMP12]] to i64 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[IDXPROM7]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM7]] // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK1-NEXT: [[MUL9:%.*]] = fmul float [[TMP10]], [[TMP13]] // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM10:%.*]] = zext i8 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM10]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM10]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 // CHECK1-NEXT: [[MUL12:%.*]] = fmul float [[MUL9]], [[TMP16]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM13:%.*]] = zext i8 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM13]] +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM13]] // CHECK1-NEXT: store float [[MUL12]], ptr [[ARRAYIDX14]], align 4 // CHECK1-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -866,21 +866,21 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) // CHECK1-IRBUILDER-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP6:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i64 [[TMP6]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[TMP6]] // CHECK1-IRBUILDER-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-IRBUILDER-NEXT: [[TMP8:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP9:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP9]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP9]] // CHECK1-IRBUILDER-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]] // CHECK1-IRBUILDER-NEXT: [[TMP11:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK1-IRBUILDER-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL7:%.*]] = fmul float [[MUL5]], [[TMP13]] // CHECK1-IRBUILDER-NEXT: [[TMP14:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: 
[[TMP15:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK1-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: @@ -1110,24 +1110,24 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP8:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP8]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM]] // CHECK1-IRBUILDER-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-IRBUILDER-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP11:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP11]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM9]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM9]] // CHECK1-IRBUILDER-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL11:%.*]] = fmul float [[TMP9]], [[TMP12]] // CHECK1-IRBUILDER-NEXT: [[TMP13:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM12]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM12]] // CHECK1-IRBUILDER-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL14:%.*]] = fmul float [[MUL11]], [[TMP15]] // CHECK1-IRBUILDER-NEXT: [[TMP16:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM15:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM15]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM15]] // CHECK1-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: @@ -1495,21 +1495,21 @@ void foo_simd(int low, int up) { // CHECK3-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-NEXT: [[TMP7:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP7]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP7]] // CHECK3-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-NEXT: [[TMP10:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX2:%.*]] = 
getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] +// CHECK3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[TMP10]] // CHECK3-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 // CHECK3-NEXT: [[MUL3:%.*]] = fmul float [[TMP8]], [[TMP11]] // CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-NEXT: [[TMP13:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP13]] +// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP13]] // CHECK3-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK3-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP14]] // CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-NEXT: [[TMP16:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] +// CHECK3-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP15]], i64 [[TMP16]] // CHECK3-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4 // CHECK3-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -1725,24 +1725,24 @@ void foo_simd(int low, int up) { // CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP9]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[IDXPROM]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[IDXPROM]] // CHECK3-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-NEXT: [[TMP12:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM7:%.*]] = zext i8 [[TMP12]] to i64 -// CHECK3-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[IDXPROM7]] +// CHECK3-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM7]] // CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK3-NEXT: [[MUL9:%.*]] = fmul float [[TMP10]], [[TMP13]] // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-NEXT: [[TMP15:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM10:%.*]] = zext i8 [[TMP15]] to i64 -// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM10]] +// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM10]] // CHECK3-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 // CHECK3-NEXT: [[MUL12:%.*]] = fmul float [[MUL9]], [[TMP16]] // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-NEXT: [[TMP18:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM13:%.*]] = zext i8 [[TMP18]] to i64 -// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM13]] +// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM13]] // CHECK3-NEXT: store float [[MUL12]], ptr [[ARRAYIDX14]], align 4 // CHECK3-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -2106,21 +2106,21 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 
[[OMP_GLOBAL_THREAD_NUM3]]) // CHECK3-IRBUILDER-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP6:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i64 [[TMP6]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[TMP6]] // CHECK3-IRBUILDER-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-IRBUILDER-NEXT: [[TMP8:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP9:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP9]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP9]] // CHECK3-IRBUILDER-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]] // CHECK3-IRBUILDER-NEXT: [[TMP11:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK3-IRBUILDER-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL7:%.*]] = fmul float [[MUL5]], [[TMP13]] // CHECK3-IRBUILDER-NEXT: [[TMP14:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK3-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: @@ -2350,24 +2350,24 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP8:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP8]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM]] // CHECK3-IRBUILDER-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-IRBUILDER-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP11:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP11]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM9]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM9]] // CHECK3-IRBUILDER-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL11:%.*]] = fmul float [[TMP9]], [[TMP12]] // CHECK3-IRBUILDER-NEXT: [[TMP13:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM12]] +// 
CHECK3-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM12]] // CHECK3-IRBUILDER-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL14:%.*]] = fmul float [[MUL11]], [[TMP15]] // CHECK3-IRBUILDER-NEXT: [[TMP16:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM15:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM15]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM15]] // CHECK3-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: @@ -2674,21 +2674,21 @@ void foo_simd(int low, int up) { // CHECK5: for.body: // CHECK5-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK5-NEXT: [[TMP2:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 [[TMP2]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i64 [[TMP2]] // CHECK5-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK5-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK5-NEXT: [[TMP5:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 [[TMP5]] +// CHECK5-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i64 [[TMP5]] // CHECK5-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 // CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], [[TMP6]] // CHECK5-NEXT: [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK5-NEXT: [[TMP8:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[TMP8]] +// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[TMP8]] // CHECK5-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 // CHECK5-NEXT: [[MUL3:%.*]] = fmul float [[MUL]], [[TMP9]] // CHECK5-NEXT: [[TMP10:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK5-NEXT: [[TMP11:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[TMP11]] // CHECK5-NEXT: store float [[MUL3]], ptr [[ARRAYIDX4]], align 4 // CHECK5-NEXT: br label [[FOR_INC:%.*]] // CHECK5: for.inc: @@ -2804,24 +2804,24 @@ void foo_simd(int low, int up) { // CHECK5-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK5-NEXT: [[TMP3:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP3]] to i64 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 [[IDXPROM]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i64 [[IDXPROM]] // CHECK5-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK5-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK5-NEXT: [[TMP6:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM4:%.*]] = zext i8 [[TMP6]] to i64 -// CHECK5-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i64 [[IDXPROM4]] +// CHECK5-NEXT: [[ARRAYIDX5:%.*]] = 
getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[IDXPROM4]] // CHECK5-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP4]], [[TMP7]] // CHECK5-NEXT: [[TMP8:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK5-NEXT: [[TMP9:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP9]] to i64 -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[IDXPROM6]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[IDXPROM6]] // CHECK5-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[MUL]], [[TMP10]] // CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK5-NEXT: [[TMP12:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP12]] to i64 -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[IDXPROM9]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM9]] // CHECK5-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK5-NEXT: br label [[FOR_INC:%.*]] // CHECK5: for.inc: diff --git a/clang/test/OpenMP/parallel_for_codegen.cpp b/clang/test/OpenMP/parallel_for_codegen.cpp index 2dec32d71b91a..c7afae419509b 100644 --- a/clang/test/OpenMP/parallel_for_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_codegen.cpp @@ -665,24 +665,24 @@ void range_for_collapsed() { // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK1-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ 
-779,21 +779,21 @@ void range_for_collapsed() { // CHECK1-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK1-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -882,21 +882,21 @@ void range_for_collapsed() { // CHECK1-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, 
!llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK1-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1159,24 +1159,24 @@ void range_for_collapsed() { // CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]] // CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]] // CHECK1-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]] // CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64 -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]] // CHECK1-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ 
-1303,7 +1303,7 @@ void range_for_collapsed() { // CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float // CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4 @@ -1312,7 +1312,7 @@ void range_for_collapsed() { // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]] // CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK1-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]] // CHECK1-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4 @@ -1781,24 +1781,24 @@ void range_for_collapsed() { // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8 // CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK2-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8 // CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK2-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK2-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK2-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK2-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -1895,21 +1895,21 @@ void range_for_collapsed() { // CHECK2-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, 
!llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK2-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK2-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK2-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -1998,21 +1998,21 @@ void range_for_collapsed() { // CHECK2-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK2-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], 
i64 [[TMP18]] +// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK2-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK2-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -2275,24 +2275,24 @@ void range_for_collapsed() { // CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]] // CHECK2-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]] +// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]] // CHECK2-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]] // CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64 -// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]] +// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]] // CHECK2-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]] // CHECK2-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64 -// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]] +// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]] // CHECK2-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -2419,7 +2419,7 @@ void range_for_collapsed() { // CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float // CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM:%.*]] = zext i32 
[[TMP13]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]] // CHECK2-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK2-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]] // CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4 @@ -2428,7 +2428,7 @@ void range_for_collapsed() { // CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]] +// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]] // CHECK2-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK2-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]] // CHECK2-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4 @@ -2897,24 +2897,24 @@ void range_for_collapsed() { // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG45:![0-9]+]] // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]], !dbg [[DBG45]] // CHECK5-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG46:![0-9]+]] // 
CHECK5: omp.body.continue: @@ -3011,21 +3011,21 @@ void range_for_collapsed() { // CHECK5-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !dbg [[DBG54]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG56:![0-9]+]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG56]] // CHECK5-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG57:![0-9]+]] // CHECK5: omp.body.continue: @@ -3114,21 +3114,21 @@ void range_for_collapsed() { // CHECK5-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !dbg [[DBG66]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG68:![0-9]+]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg 
[[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG68]] // CHECK5-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG69:![0-9]+]] // CHECK5: omp.body.continue: @@ -3391,24 +3391,24 @@ void range_for_collapsed() { // CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG97:![0-9]+]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], 
!llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]], !dbg [[DBG97]] // CHECK5-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG98:![0-9]+]] // CHECK5: omp.body.continue: @@ -3535,7 +3535,7 @@ void range_for_collapsed() { // CHECK5-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float, !dbg [[DBG111]] // CHECK5-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP13]] to i64, !dbg [[DBG111]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]], !dbg [[DBG111]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]], !dbg [[DBG111]] // CHECK5-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]], !dbg [[DBG111]] // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !dbg [[DBG111]] @@ -3544,7 +3544,7 @@ void range_for_collapsed() { // CHECK5-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG111]] // CHECK5-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64, !dbg [[DBG111]] -// CHECK5-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]], !dbg [[DBG111]] +// CHECK5-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]], !dbg [[DBG111]] // CHECK5-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]], !dbg [[DBG111]] // CHECK5-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4, !dbg [[DBG111]] @@ -4013,24 +4013,24 @@ void range_for_collapsed() { // CHECK6-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8 // CHECK6-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK6-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK6-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8 // CHECK6-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK6-NEXT: 
[[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK6-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK6-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK6-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK6-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK6-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK6-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK6-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK6-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4127,21 +4127,21 @@ void range_for_collapsed() { // CHECK6-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK6-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK6-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK6-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK6-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK6-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK6-NEXT: store float [[MUL5]], ptr 
[[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4230,21 +4230,21 @@ void range_for_collapsed() { // CHECK6-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK6-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK6-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK6-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK6-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK6-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK6-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4507,24 +4507,24 @@ void range_for_collapsed() { // CHECK6-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]] // CHECK6-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]] +// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]] // CHECK6-NEXT: [[TMP18:%.*]] = load float, ptr 
[[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]] // CHECK6-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64 -// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]] +// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]] // CHECK6-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]] // CHECK6-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64 -// CHECK6-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]] +// CHECK6-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]] // CHECK6-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4649,7 +4649,7 @@ void range_for_collapsed() { // CHECK6-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float // CHECK6-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]] // CHECK6-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK6-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]] // CHECK6-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4 @@ -4658,7 +4658,7 @@ void range_for_collapsed() { // CHECK6-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK6-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK6-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]] +// CHECK6-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]] // CHECK6-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK6-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]] // CHECK6-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4 diff --git a/clang/test/OpenMP/parallel_for_linear_codegen.cpp b/clang/test/OpenMP/parallel_for_linear_codegen.cpp index 8b46797ae253f..15eb0dfa42af5 100644 --- a/clang/test/OpenMP/parallel_for_linear_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_linear_codegen.cpp @@ -337,7 +337,7 @@ int main() { // CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP14]], [[MUL6]] // CHECK1-NEXT: store i32 [[ADD7]], ptr [[LVAR3]], align 4 // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[PVAR2]], align 8 -// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 +// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP16]], i32 1 // CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[PVAR2]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[LVAR3]], align 4 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP17]], 1 diff --git 
a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp index 752aec788bf34..59d169d7a1738 100644 --- a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp @@ -83,16 +83,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -122,7 +122,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -137,19 +137,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr 
[[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP36]], align 8 @@ -470,9 +470,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/parallel_for_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_scan_codegen.cpp index 161534814a793..67b32407c712f 100644 --- a/clang/test/OpenMP/parallel_for_scan_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_scan_codegen.cpp @@ -28,9 +28,9 @@ void baz(int n) { // CHECK: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call( // CHECK: [[LAST:%.+]] = mul nsw i64 9, % - // CHECK: [[LAST_REF:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[LAST]] + // CHECK: [[LAST_REF:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[LAST]] // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 @_ZZ3baziE1a, ptr align 4 [[LAST_REF]], i64 %{{.+}}, i1 false) - // CHECK: [[LAST_REF_B:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 9 + // CHECK: [[LAST_REF_B:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 9 // CHECK: [[LAST_VAL:%.+]] = load double, ptr [[LAST_REF_B]], // CHECK: store double [[LAST_VAL]], ptr @_ZZ3baziE1b, @@ -58,13 +58,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -91,13 +91,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -151,13 +151,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = 
getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -188,13 +188,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -226,13 +226,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -289,13 +289,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 
[[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp index 7e973a602a65c..cac997753d480 100644 --- a/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp @@ -51,13 +51,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -84,13 +84,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr 
[[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -144,13 +144,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -181,13 +181,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -219,13 +219,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // 
CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -282,13 +282,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp index b17757a5f978a..c1fe00f238001 100644 --- a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp @@ -72,16 +72,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -111,7 +111,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = 
getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -126,19 +126,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -425,9 +425,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr 
inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp index 3c74aaca3f46d..1d106922435d5 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp @@ -84,9 +84,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -138,10 +138,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0:%.+]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N:%.+]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp index 7d4216ddde6a3..9a524c3b94c6e 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp +++ 
b/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp @@ -84,9 +84,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -138,10 +138,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0:%.+]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N:%.+]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/parallel_reduction_codegen.cpp b/clang/test/OpenMP/parallel_reduction_codegen.cpp index f49faa6b89deb..ce76429b871fe 100644 --- a/clang/test/OpenMP/parallel_reduction_codegen.cpp +++ b/clang/test/OpenMP/parallel_reduction_codegen.cpp @@ -354,9 +354,9 @@ int main() { // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK1-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i64 0 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 
0 // CHECK1-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK1-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP4:%.*]] = sub i64 [[TMP2]], [[TMP3]] @@ -1632,9 +1632,9 @@ int main() { // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK3-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 0 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i64 0 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 0 +// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 0 // CHECK3-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK3-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK3-NEXT: [[TMP4:%.*]] = sub i64 [[TMP2]], [[TMP3]] @@ -2142,9 +2142,9 @@ int main() { // CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 // CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 0 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i64 0 // CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 0 +// CHECK4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 0 // CHECK4-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK4-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK4-NEXT: [[TMP4:%.*]] = sub i64 [[TMP2]], [[TMP3]] diff --git a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp index 208f7a41aa3db..40cc3103b1c0f 100644 --- a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp @@ -72,16 +72,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 
// CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -111,7 +111,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -126,19 +126,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -416,9 +416,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// 
CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp index 6d73652c3ea27..61597a074cf59 100644 --- a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp @@ -81,16 +81,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_SECTIONS_IL_]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -120,7 +120,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -135,19 +135,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void 
@llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -458,9 +458,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp index 4d2b93ffd4712..a7db3da7d1f86 100644 --- a/clang/test/OpenMP/reduction_implicit_map.cpp +++ b/clang/test/OpenMP/reduction_implicit_map.cpp @@ -133,9 +133,9 @@ int main() // CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = load 
ptr, ptr [[E_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i64 0 // CHECK-NEXT: store double 0.000000e+00, ptr [[E2]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[E_ADDR]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 @@ -529,16 +529,16 @@ int main() // CHECK1-NEXT: store i64 9, ptr [[DOTOMP_UB]], align 8 // CHECK1-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 // CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY]], i64 2 // CHECK1-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX1]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY2]], i64 1 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY5:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX4]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY5]], i64 5 +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY5]], i64 5 // CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX6]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY7]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY7]], i64 1 // CHECK1-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK1-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], [[TMP2]] @@ -564,18 +564,18 @@ int main() // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] // CHECK1-NEXT: [[TMP12:%.*]] = sdiv exact i64 [[TMP11]], ptrtoint (ptr getelementptr (double, ptr null, i32 1) to i64) // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[VLA]], i64 [[TMP12]] -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [1 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [1 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 // CHECK1-NEXT: [[ARRAYDECAY10:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX9]], i64 
0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY10]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY10]], i64 2 // CHECK1-NEXT: [[ARRAYDECAY12:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX11]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY12]], i64 1 -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY12]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY15:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX14]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY15]], i64 5 +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY15]], i64 5 // CHECK1-NEXT: [[ARRAYDECAY17:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX16]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY17]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY17]], i64 1 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX13]], ptr [[TMP15]], align 8 @@ -852,7 +852,7 @@ int main() // CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 0 // CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: [[TMP9:%.*]] = mul nuw i32 [[TMP8]], 4 // CHECK2-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 @@ -930,10 +930,10 @@ int main() // CHECK2-NEXT: [[TMP46:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP47:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP48:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP48]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP48]], i32 0 // CHECK2-NEXT: [[TMP49:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP50:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP50]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP50]], i32 0 // CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: [[TMP52:%.*]] = mul nuw i32 [[TMP51]], 4 // CHECK2-NEXT: [[TMP53:%.*]] = sext i32 [[TMP52]] to i64 @@ -1007,7 +1007,7 @@ int main() // CHECK2-NEXT: [[TMP86:%.*]] = load i32, ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: store i32 [[TMP86]], ptr [[SIZE_CASTED21]], align 4 // CHECK2-NEXT: [[TMP87:%.*]] = load i32, ptr [[SIZE_CASTED21]], align 4 -// CHECK2-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[A]], i32 0, i32 0 +// CHECK2-NEXT: 
[[ARRAYIDX22:%.*]] = getelementptr inbounds nuw [10 x i32], ptr [[A]], i32 0, i32 0 // CHECK2-NEXT: [[TMP88:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0 // CHECK2-NEXT: store i32 [[TMP87]], ptr [[TMP88]], align 4 // CHECK2-NEXT: [[TMP89:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0 @@ -1480,9 +1480,9 @@ int main() // CHECK2-NEXT: store ptr [[OUTPUT]], ptr [[OUTPUT_ADDR]], align 4 // CHECK2-NEXT: store ptr [[INPUT]], ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 0 // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 2 +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 2 // CHECK2-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [3 x i32], ptr [[OUTPUT2]], i32 0, i32 0 // CHECK2-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i32 3 // CHECK2-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP2]] @@ -1664,9 +1664,9 @@ int main() // CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i32 0 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 +// CHECK2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 2 // CHECK2-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [3 x i32], ptr [[OUTPUT4]], i32 0, i32 0 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i32 3 // CHECK2-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP8]] @@ -1878,8 +1878,8 @@ int main() // CHECK2-NEXT: store i32 [[SIZE]], ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 0 -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 1 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [10 x i32], ptr [[TMP0]], i32 0, i32 0 +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [10 x i32], ptr [[TMP0]], i32 0, i32 1 // CHECK2-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x i32], ptr [[A2]], i32 0, i32 0 // CHECK2-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i32 2 // CHECK2-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]] diff --git a/clang/test/OpenMP/sections_reduction_task_codegen.cpp b/clang/test/OpenMP/sections_reduction_task_codegen.cpp index 1a2cf7aede321..5d749eeb81776 100644 --- a/clang/test/OpenMP/sections_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/sections_reduction_task_codegen.cpp @@ -82,16 +82,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_SECTIONS_IL_]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load 
ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP5]] // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP6]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP6]], i64 9 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = sub i64 [[TMP8]], [[TMP9]] @@ -121,7 +121,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP21]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP23]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -136,19 +136,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP28]], align 8 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP29]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] 
= add nsw i64 -1, [[TMP34]] // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP35]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP35]], i64 9 // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP36]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP30]], align 8 // CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP37]], align 8 @@ -463,9 +463,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp b/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp index d912f801a33db..19a523ee165c8 100644 --- a/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp +++ b/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp @@ -54,12 +54,12 @@ int main() { // CHECK: [[SIZES:%.+]] = alloca [6 x i64], // CHECK: [[VLA_ADDR:%.+]] = alloca float, i64 %{{.+}}, // CHECK: [[PTR:%.+]] = load ptr, ptr [[PTR_ADDR]], -// CHECK-NEXT: [[ARR_IDX:%.+]] = getelementptr inbounds float, ptr [[PTR]], i64 3 +// CHECK-NEXT: [[ARR_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[PTR]], i64 3 // CHECK: [[P5:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-NEXT: [[ARR_IDX1:%.+]] = getelementptr inbounds float, ptr [[P5]], i64 0 // CHECK: [[P7:%.+]] = load ptr, ptr [[REF_ADDR]], // CHECK-NEXT: [[REF:%.+]] = load ptr, ptr [[REF_ADDR]], -// CHECK-NEXT: [[ARR_IDX2:%.+]] = getelementptr inbounds [4 x float], ptr [[ARR_ADDR]], i64 0, i64 0 +// CHECK-NEXT: [[ARR_IDX2:%.+]] = getelementptr inbounds nuw [4 x float], ptr [[ARR_ADDR]], i64 0, i64 0 // CHECK: [[P10:%.+]] = mul nuw i64 {{.+}}, 4 // CHECK-NEXT: [[ARR_IDX5:%.+]] = getelementptr inbounds float, ptr [[VLA_ADDR]], i64 0 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[SIZES]], ptr align 8 [[SIZES1]], i64 48, i1 false) @@ -132,14 +132,14 @@ int main() { // CHECK: [[SIZES:%.+]] = alloca [6 x i64], // CHECK: [[A_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS:%.+]], i32 0, i32 0 // CHECK: [[PTR_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 1 -// CHECK: [[ARR_IDX:%.+]] = getelementptr inbounds i32, ptr %{{.+}}, i64 3 +// CHECK: [[ARR_IDX:%.+]] = getelementptr inbounds nuw i32, ptr %{{.+}}, i64 3 // CHECK: [[REF_REF:%.+]] = 
getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 2 // CHECK: [[REF_PTR:%.+]] = load ptr, ptr [[REF_REF]], // CHECK-NEXT: [[P3:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 1 // CHECK: [[ARR_IDX5:%.+]] = getelementptr inbounds i32, ptr {{.+}}, i64 0 // CHECK: [[ARR_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 3 -// CHECK: [[ARR_IDX6:%.+]] = getelementptr inbounds [4 x i32], ptr [[ARR_ADDR]], i64 0, i64 0 +// CHECK: [[ARR_IDX6:%.+]] = getelementptr inbounds nuw [4 x i32], ptr [[ARR_ADDR]], i64 0, i64 0 // CHECK: [[A_ADDR2:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 0 // CHECK: [[P4:%.+]] = mul nuw i64 [[CONV:%.+]], 4 // CHECK: [[A_ADDR3:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 0 @@ -147,7 +147,7 @@ int main() { // CHECK: [[L6:%.+]] = sext i32 [[L5]] to i64 // CHECK: [[LB_ADD_LEN:%lb_add_len]] = add nsw i64 -1, [[L6]] // CHECK: [[ARR_ADDR9:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 3 -// CHECK: [[ARR_IDX10:%arrayidx.+]] = getelementptr inbounds [4 x i32], ptr [[ARR_ADDR9]], i64 0, i64 %lb_add_len +// CHECK: [[ARR_IDX10:%arrayidx.+]] = getelementptr inbounds nuw [4 x i32], ptr [[ARR_ADDR9]], i64 0, i64 %lb_add_len // CHECK: [[ARR_END:%.+]] = getelementptr i32, ptr [[ARR_IDX10]], i32 1 // CHECK: [[E:%.+]] = ptrtoint ptr [[ARR_END]] to i64 // CHECK: [[B:%.+]] = ptrtoint ptr [[A_ADDR]] to i64 diff --git a/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp b/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp index c90819dc2a22f..6d1c0213d648c 100644 --- a/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp +++ b/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp @@ -49,14 +49,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds double, ptr [[TT]], i32 1 + // CK1: getelementptr inbounds nuw double, ptr [[TT]], i32 1 #pragma omp target data map(g[:10]) use_device_ptr(g) { ++g; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE00]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 ++g; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -67,26 +67,26 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(l) { ++l; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE01]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1-NOT: call void @__tgt_target // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(l) if(0) { ++l; } // CK1-NOT: call void @__tgt_target // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -97,14 
+97,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(l) if(1) { ++l; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE03]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1: [[CMP:%.+]] = icmp ne ptr %{{.+}}, null @@ -119,12 +119,12 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 // CK1: br label %[[BEND:.+]] // CK1: [[BELSE]]: // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 // CK1: br label %[[BEND]] #pragma omp target data map(l[:10]) use_device_ptr(l) if(lr != 0) { @@ -142,7 +142,7 @@ void foo(float *&lr, T *&tr) { // CK1: [[BEND]]: // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1: [[T2:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -156,7 +156,7 @@ void foo(float *&lr, T *&tr) { // CK1: store ptr [[PVTV]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], // CK1: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK1: getelementptr inbounds float, ptr [[TT2]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT2]], i32 1 #pragma omp target data map(lr[:10]) use_device_ptr(lr) { ++lr; @@ -164,7 +164,7 @@ void foo(float *&lr, T *&tr) { // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE05]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK1: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK1: getelementptr inbounds float, ptr [[TTTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTTT]], i32 1 ++lr; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -175,14 +175,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(t[:10]) use_device_ptr(t) { ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE06]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++t; // CK1: [[T2:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -196,7 +196,7 @@ void foo(float *&lr, T *&tr) { // CK1: store ptr [[PVTV]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], // CK1: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK1: getelementptr inbounds i32, ptr [[TT2]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT2]], i32 1 #pragma omp target data map(tr[:10]) use_device_ptr(tr) { ++tr; @@ -204,7 +204,7 @@ void foo(float *&lr, T *&tr) { // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE07]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK1: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK1: getelementptr inbounds 
i32, ptr [[TTTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTTT]], i32 1 ++tr; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -215,14 +215,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE08]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; ++t; @@ -232,18 +232,18 @@ void foo(float *&lr, T *&tr) { // CK1: [[VAL:%.+]] = load ptr, ptr {{%.+}}, // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[_TT1:%.+]] = load ptr, ptr [[_PVT]], - // CK1: getelementptr inbounds float, ptr [[_TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TT1]], i32 1 // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l) use_device_ptr(t) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE09]] // CK1: [[_TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds float, ptr [[_TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TTT]], i32 1 // CK1: [[TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++l; ++t; // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE10]] @@ -252,18 +252,18 @@ void foo(float *&lr, T *&tr) { // CK1: [[VAL:%.+]] = load ptr, ptr {{%.+}}, // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[_TT1:%.+]] = load ptr, ptr [[_PVT]], - // CK1: getelementptr inbounds float, ptr [[_TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TT1]], i32 1 // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l,t) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE10]] // CK1: [[_TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds float, ptr [[_TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TTT]], i32 1 // CK1: [[TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++l; ++t; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -274,14 +274,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(t) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE11]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++l; ++t; // CK1: [[T2:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -295,7 +295,7 @@ void foo(float *&lr, T *&tr) { // CK1: store ptr [[PVTV]], ptr [[PVT:%.+]], // 
CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], // CK1: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK1: getelementptr inbounds i32, ptr [[TT2]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT2]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(tr) { ++l; ++tr; @@ -303,7 +303,7 @@ void foo(float *&lr, T *&tr) { // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE12]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK1: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK1: getelementptr inbounds i32, ptr [[TTTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTTT]], i32 1 ++l; ++tr; } @@ -354,7 +354,7 @@ struct ST { // CK2: store ptr [[PVT]], ptr [[PVT2:%.+]], // CK2: [[TT1:%.+]] = load ptr, ptr [[PVT2]], // CK2: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TT2]], i32 1 #pragma omp target data map(a[:10]) use_device_ptr(a) { a++; @@ -362,7 +362,7 @@ struct ST { // CK2: call void @__tgt_target_data_end{{.+}}[[MTYPE00]] // CK2: [[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 0 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK2: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 a++; // CK2: [[BP:%.+]] = getelementptr inbounds [2 x ptr], ptr %{{.+}}, i32 0, i32 1 @@ -373,7 +373,7 @@ struct ST { // CK2: store ptr [[PVT]], ptr [[PVT2:%.+]], // CK2: [[TT1:%.+]] = load ptr, ptr [[PVT2]], // CK2: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TT2]], i32 1 #pragma omp target data map(b[:10]) use_device_ptr(b) { b++; @@ -382,7 +382,7 @@ struct ST { // CK2: [[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %{{.+}}, i32 0, i32 1 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK2: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK2: getelementptr inbounds double, ptr [[TTTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTTT]], i32 1 b++; // CK2: [[BP:%.+]] = getelementptr inbounds [3 x ptr], ptr %{{.+}}, i32 0, i32 2 @@ -393,7 +393,7 @@ struct ST { // CK2: store ptr [[PVT]], ptr [[PVT2:%.+]], // CK2: [[TT1:%.+]] = load ptr, ptr [[PVT2]], // CK2: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TT2]], i32 1 #pragma omp target data map(la[:10]) use_device_ptr(a) { a++; @@ -402,7 +402,7 @@ struct ST { // CK2: call void @__tgt_target_data_end{{.+}}[[MTYPE02]] // CK2: [[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 0 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK2: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 a++; la++; @@ -419,10 +419,10 @@ struct ST { // CK2: store ptr [[PVT1]], ptr [[_PVT1:%.+]], // CK2: [[TT2:%.+]] = load ptr, ptr [[_PVT2]], // CK2: [[_TT2:%.+]] = load ptr, ptr [[TT2]], - // CK2: getelementptr inbounds double, ptr [[_TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[_TT2]], i32 1 // CK2: [[TT1:%.+]] = load ptr, ptr [[_PVT1]], // CK2: [[_TT1:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[_TT1]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[_TT1]], i32 1 #pragma omp target data map(b[:10]) use_device_ptr(a, b) { a++; @@ -431,11 +431,11 @@ struct ST { // CK2: call void @__tgt_target_data_end{{.+}}[[MTYPE03]] // CK2: 
[[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 0 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK2: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 // CK2: [[_DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 1 // CK2: [[_TTT:%.+]] = load ptr, ptr [[_DECL]], // CK2: [[_TTTT:%.+]] = load ptr, ptr [[_TTT]], - // CK2: getelementptr inbounds double, ptr [[_TTTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[_TTTT]], i32 1 a++; b++; } diff --git a/clang/test/OpenMP/target_has_device_addr_codegen.cpp b/clang/test/OpenMP/target_has_device_addr_codegen.cpp index 08bcc87ca5f0a..39eaedb0e48d1 100644 --- a/clang/test/OpenMP/target_has_device_addr_codegen.cpp +++ b/clang/test/OpenMP/target_has_device_addr_codegen.cpp @@ -586,7 +586,7 @@ void use_template() { // CHECK-NEXT: store ptr [[K]], ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CHECK-NEXT: ret void // @@ -601,7 +601,7 @@ void use_template() { // CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CHECK-NEXT: ret void // @@ -1079,7 +1079,7 @@ void use_template() { // CHECK-NEXT: store ptr [[K]], ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CHECK-NEXT: ret void // @@ -1094,7 +1094,7 @@ void use_template() { // CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CHECK-NEXT: ret void // @@ -1133,7 +1133,7 @@ void use_template() { // CHECK-NEXT: store ptr [[K]], ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CHECK-NEXT: ret void // @@ -1148,7 +1148,7 @@ void use_template() { // CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = 
getelementptr inbounds ptr, ptr [[TMP2]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP2]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CHECK-NEXT: ret void // @@ -1422,14 +1422,14 @@ void use_template() { // SIMD-ONLY0-NEXT: store ptr [[K]], ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[AA]], ptr [[RAA]], align 8 // SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load ptr, ptr [[K]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP4]], align 8 // SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[AA]], i64 0, i64 0 // SIMD-ONLY0-NEXT: store i32 1, ptr [[ARRAYIDX]], align 4 @@ -1478,14 +1478,14 @@ void use_template() { // SIMD-ONLY0-NEXT: store ptr [[TMP0]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: store ptr [[K]], ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load ptr, ptr [[K]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP4]], align 8 // SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[AA]], i64 0, i64 0 // SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 @@ -1520,14 +1520,14 @@ void use_template() { // SIMD-ONLY0-NEXT: store ptr [[TMP0]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: store ptr [[K]], ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load ptr, ptr [[K]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 1 +// 
SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP4]], align 8 // SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[AA]], i64 0, i64 0 // SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 diff --git a/clang/test/OpenMP/target_in_reduction_codegen.cpp b/clang/test/OpenMP/target_in_reduction_codegen.cpp index fb715e2de2a59..56191ee575136 100644 --- a/clang/test/OpenMP/target_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/target_in_reduction_codegen.cpp @@ -70,7 +70,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -85,7 +85,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP12]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -100,7 +100,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP21]], align 8 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP22]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -118,7 +118,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP35:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP35]], ptr [[DOTTASK_RED_]], align 
8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP36]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -133,7 +133,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP43]], align 8 // CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP44]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP46]], align 8 // CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp index 3a1c168533c37..505c34e21733c 100644 --- a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp +++ b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp @@ -2142,7 +2142,7 @@ void bar() { // CK10-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 // CK10-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 8 // CK10-NEXT: ret void // @@ -2153,7 +2153,7 @@ void bar() { // CK10-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 // CK10-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 8 // CK10-NEXT: ret void // @@ -2164,7 +2164,7 @@ void bar() { // CK10-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 8 // CK10-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 8 // CK10-NEXT: ret void // @@ -2178,7 +2178,7 @@ void bar() { // CK10-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: 
[[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: ret void // @@ -2192,7 +2192,7 @@ void bar() { // CK10-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: ret void // @@ -2206,7 +2206,7 @@ void bar() { // CK10-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: ret void // @@ -2224,11 +2224,11 @@ void bar() { // CK10-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 // CK10-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 8 // CK10-NEXT: ret void // @@ -2613,7 +2613,7 @@ void bar() { // CK11-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 // CK11-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 8 // CK11-NEXT: ret void // @@ -2624,7 +2624,7 @@ void bar() { // CK11-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 // CK11-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 8 // CK11-NEXT: ret void // @@ -2635,7 +2635,7 @@ void bar() { // CK11-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 8 // CK11-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 8 // CK11-NEXT: ret void // @@ -2649,7 +2649,7 @@ void bar() { // CK11-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 8 // CK11-NEXT: 
[[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: ret void // @@ -2663,7 +2663,7 @@ void bar() { // CK11-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: ret void // @@ -2677,7 +2677,7 @@ void bar() { // CK11-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: ret void // @@ -2695,11 +2695,11 @@ void bar() { // CK11-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 // CK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 8 // CK11-NEXT: ret void // @@ -3084,7 +3084,7 @@ void bar() { // CK12-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 // CK12-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 4 // CK12-NEXT: ret void // @@ -3095,7 +3095,7 @@ void bar() { // CK12-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 4 // CK12-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 4 // CK12-NEXT: ret void // @@ -3106,7 +3106,7 @@ void bar() { // CK12-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 4 // CK12-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 4 // 
CK12-NEXT: ret void // @@ -3120,7 +3120,7 @@ void bar() { // CK12-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: ret void // @@ -3134,7 +3134,7 @@ void bar() { // CK12-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: ret void // @@ -3148,7 +3148,7 @@ void bar() { // CK12-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: ret void // @@ -3166,11 +3166,11 @@ void bar() { // CK12-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 4 // CK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 4 // CK12-NEXT: ret void // @@ -3555,7 +3555,7 @@ void bar() { // CK13-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 // CK13-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 4 // CK13-NEXT: ret void // @@ -3566,7 +3566,7 @@ void bar() { // CK13-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 4 // CK13-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 4 // CK13-NEXT: ret void // @@ -3577,7 +3577,7 @@ void bar() { // CK13-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 4 // CK13-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] 
= getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 4 // CK13-NEXT: ret void // @@ -3591,7 +3591,7 @@ void bar() { // CK13-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: ret void // @@ -3605,7 +3605,7 @@ void bar() { // CK13-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: ret void // @@ -3619,7 +3619,7 @@ void bar() { // CK13-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: ret void // @@ -3637,11 +3637,11 @@ void bar() { // CK13-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 4 // CK13-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 4 // CK13-NEXT: ret void // @@ -3674,34 +3674,34 @@ void bar() { // SIMD-ONLY00-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 8 // SIMD-ONLY00-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 8 // SIMD-ONLY00-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 8 // 
SIMD-ONLY00-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 // SIMD-ONLY00-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY00-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 8 // SIMD-ONLY00-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 8 // SIMD-ONLY00-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 8 // SIMD-ONLY00-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 8 // SIMD-ONLY00-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 8 // SIMD-ONLY00-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8 // SIMD-ONLY00-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 8 // SIMD-ONLY00-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 8 @@ -3711,11 +3711,11 @@ void bar() { // SIMD-ONLY00-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 8 // SIMD-ONLY00-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 8 // SIMD-ONLY00-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 8 // SIMD-ONLY00-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 8 // SIMD-ONLY00-NEXT: ret void // @@ -3748,34 +3748,34 @@ void bar() { // SIMD-ONLY01-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 8 // SIMD-ONLY01-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// 
SIMD-ONLY01-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 8 // SIMD-ONLY01-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 8 // SIMD-ONLY01-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 // SIMD-ONLY01-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY01-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 8 // SIMD-ONLY01-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 8 // SIMD-ONLY01-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 8 // SIMD-ONLY01-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 8 // SIMD-ONLY01-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 8 // SIMD-ONLY01-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8 // SIMD-ONLY01-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 8 // SIMD-ONLY01-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 8 @@ -3785,11 +3785,11 @@ void bar() { // SIMD-ONLY01-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 8 // SIMD-ONLY01-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 8 // SIMD-ONLY01-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 8 // SIMD-ONLY01-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 8 // SIMD-ONLY01-NEXT: ret void // @@ -3822,34 +3822,34 @@ void bar() { // SIMD-ONLY02-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 // 
SIMD-ONLY02-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 4 // SIMD-ONLY02-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 4 // SIMD-ONLY02-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 4 // SIMD-ONLY02-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 4 // SIMD-ONLY02-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 // SIMD-ONLY02-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 4 // SIMD-ONLY02-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 4 // SIMD-ONLY02-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 4 // SIMD-ONLY02-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 4 // SIMD-ONLY02-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 4 // SIMD-ONLY02-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 4 // SIMD-ONLY02-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 4 // SIMD-ONLY02-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 4 @@ -3859,11 +3859,11 @@ void bar() { // SIMD-ONLY02-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 4 // SIMD-ONLY02-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 4 // SIMD-ONLY02-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 4 // SIMD-ONLY02-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], 
align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 4 // SIMD-ONLY02-NEXT: ret void // @@ -3896,34 +3896,34 @@ void bar() { // SIMD-ONLY03-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 4 // SIMD-ONLY03-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 4 // SIMD-ONLY03-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 4 // SIMD-ONLY03-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 4 // SIMD-ONLY03-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 // SIMD-ONLY03-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 4 // SIMD-ONLY03-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 4 // SIMD-ONLY03-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 4 // SIMD-ONLY03-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 4 // SIMD-ONLY03-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 4 // SIMD-ONLY03-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 4 // SIMD-ONLY03-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 4 // SIMD-ONLY03-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 4 @@ -3933,11 +3933,11 @@ void bar() { // SIMD-ONLY03-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP19:%.*]] = load ptr, ptr 
[[_TMP8]], align 4 // SIMD-ONLY03-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 4 // SIMD-ONLY03-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 4 // SIMD-ONLY03-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 4 // SIMD-ONLY03-NEXT: ret void // @@ -3951,7 +3951,7 @@ void bar() { // CK20-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK20-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // CK20-NEXT: ret void // @@ -4185,7 +4185,7 @@ void bar() { // CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK20-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK20-NEXT: ret void // @@ -4199,7 +4199,7 @@ void bar() { // CK20-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // CK20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CK20-NEXT: ret void // @@ -4212,12 +4212,12 @@ void bar() { // CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK20-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK20-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 8 // CK20-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK20-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK20-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 8 // CK20-NEXT: ret void // @@ -4231,7 +4231,7 @@ void bar() { // CK21-NEXT: call 
void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK21-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // CK21-NEXT: ret void // @@ -4465,7 +4465,7 @@ void bar() { // CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK21-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK21-NEXT: ret void // @@ -4479,7 +4479,7 @@ void bar() { // CK21-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // CK21-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CK21-NEXT: ret void // @@ -4492,12 +4492,12 @@ void bar() { // CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK21-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK21-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK21-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 8 // CK21-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK21-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK21-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 8 // CK21-NEXT: ret void // @@ -4511,7 +4511,7 @@ void bar() { // CK22-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK22-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 // CK22-NEXT: ret void // @@ -4745,7 +4745,7 @@ void bar() { // CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK22-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK22-NEXT: 
[[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK22-NEXT: ret void // @@ -4759,7 +4759,7 @@ void bar() { // CK22-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK22-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // CK22-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 4 // CK22-NEXT: ret void // @@ -4772,12 +4772,12 @@ void bar() { // CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK22-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK22-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK22-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK22-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 4 // CK22-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK22-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK22-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 4 // CK22-NEXT: ret void // @@ -4791,7 +4791,7 @@ void bar() { // CK23-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK23-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 // CK23-NEXT: ret void // @@ -5025,7 +5025,7 @@ void bar() { // CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK23-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK23-NEXT: ret void // @@ -5039,7 +5039,7 @@ void bar() { // CK23-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // CK23-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 4 // CK23-NEXT: ret void // @@ -5052,12 +5052,12 @@ 
void bar() { // CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK23-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK23-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK23-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 4 // CK23-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK23-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK23-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 4 // CK23-NEXT: ret void // @@ -5071,7 +5071,7 @@ void bar() { // SIMD-ONLY10-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY10-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // SIMD-ONLY10-NEXT: ret void // @@ -5101,21 +5101,21 @@ void bar() { // SIMD-ONLY10-NEXT: store ptr null, ptr [[LA]], align 8 // SIMD-ONLY10-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // SIMD-ONLY10-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // SIMD-ONLY10-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 8 // SIMD-ONLY10-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY10-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 8 // SIMD-ONLY10-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY10-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 8 // SIMD-ONLY10-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw 
double, ptr [[TMP5]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 8 // SIMD-ONLY10-NEXT: ret void // @@ -5145,7 +5145,7 @@ void bar() { // SIMD-ONLY11-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY11-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // SIMD-ONLY11-NEXT: ret void // @@ -5175,21 +5175,21 @@ void bar() { // SIMD-ONLY11-NEXT: store ptr null, ptr [[LA]], align 8 // SIMD-ONLY11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // SIMD-ONLY11-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // SIMD-ONLY11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 8 // SIMD-ONLY11-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 8 // SIMD-ONLY11-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 8 // SIMD-ONLY11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 8 // SIMD-ONLY11-NEXT: ret void // @@ -5219,7 +5219,7 @@ void bar() { // SIMD-ONLY12-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY12-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 
4 // SIMD-ONLY12-NEXT: ret void // @@ -5249,21 +5249,21 @@ void bar() { // SIMD-ONLY12-NEXT: store ptr null, ptr [[LA]], align 4 // SIMD-ONLY12-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // SIMD-ONLY12-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // SIMD-ONLY12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 4 // SIMD-ONLY12-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 4 // SIMD-ONLY12-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY12-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 4 // SIMD-ONLY12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 4 // SIMD-ONLY12-NEXT: ret void // @@ -5293,7 +5293,7 @@ void bar() { // SIMD-ONLY13-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY13-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 // SIMD-ONLY13-NEXT: ret void // @@ -5323,21 +5323,21 @@ void bar() { // SIMD-ONLY13-NEXT: store ptr null, ptr [[LA]], align 4 // SIMD-ONLY13-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // SIMD-ONLY13-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // SIMD-ONLY13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// SIMD-ONLY13-NEXT: 
[[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 4 // SIMD-ONLY13-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY13-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 4 // SIMD-ONLY13-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 4 // SIMD-ONLY13-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 4 // SIMD-ONLY13-NEXT: ret void // diff --git a/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp b/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp index fcaceac7d3467..87fa7fe462daa 100644 --- a/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp +++ b/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp @@ -45,7 +45,7 @@ void foo() { // CHECK-NEXT: store ptr [[CALL]], ptr [[PTR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i64 0 // CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK-NEXT: store ptr [[PTR]], ptr [[TMP2]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 diff --git a/clang/test/OpenMP/target_map_codegen_01.cpp b/clang/test/OpenMP/target_map_codegen_01.cpp index d112500eb5fdd..9f3553d2377cb 100644 --- a/clang/test/OpenMP/target_map_codegen_01.cpp +++ b/clang/test/OpenMP/target_map_codegen_01.cpp @@ -108,6 +108,6 @@ void implicit_maps_reference (int a, int *b){ // CK2: store ptr [[ADDR]], ptr [[REF]], // CK2: [[T:%.+]] = load ptr, ptr [[REF]], // CK2: [[TT:%.+]] = load ptr, ptr [[T]], -// CK2: getelementptr inbounds i32, ptr [[TT]], i32 1 +// CK2: getelementptr inbounds nuw i32, ptr [[TT]], i32 1 #endif // CK2 #endif diff --git a/clang/test/OpenMP/target_map_codegen_21.cpp b/clang/test/OpenMP/target_map_codegen_21.cpp index a1419b7d4beb8..f5c517692d8c8 100644 --- a/clang/test/OpenMP/target_map_codegen_21.cpp +++ b/clang/test/OpenMP/target_map_codegen_21.cpp @@ -185,7 +185,7 @@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: store ptr @c, ptr [[BP0]] -// CK22-DAG: store ptr getelementptr inbounds ([100 x i32], ptr @c, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] +// CK22-DAG: store ptr getelementptr inbounds nuw ([100 x i32], ptr @c, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] // CK22: call void [[CALL03:@.+]](ptr {{[^,]+}}) #pragma omp target map(c [1:4]) @@ -277,7 +277,7 
@@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: store ptr @sc, ptr [[BP0]] -// CK22-DAG: store ptr getelementptr inbounds ([100 x [[ST]]], ptr @sc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] +// CK22-DAG: store ptr getelementptr inbounds nuw ([100 x [[ST]]], ptr @sc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] // CK22: call void [[CALL08:@.+]](ptr {{[^,]+}}) #pragma omp target map(sc [1:4]) @@ -369,7 +369,7 @@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: store ptr @stc, ptr [[BP0]] -// CK22-DAG: store ptr getelementptr inbounds ([100 x [[STT]]], ptr @stc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] +// CK22-DAG: store ptr getelementptr inbounds nuw ([100 x [[STT]]], ptr @stc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] // CK22: call void [[CALL13:@.+]](ptr {{[^,]+}}) #pragma omp target map(stc [1:4]) diff --git a/clang/test/OpenMP/target_map_codegen_27.cpp b/clang/test/OpenMP/target_map_codegen_27.cpp index fe7ae12e00d13..bfe75bca481be 100644 --- a/clang/test/OpenMP/target_map_codegen_27.cpp +++ b/clang/test/OpenMP/target_map_codegen_27.cpp @@ -82,7 +82,7 @@ void explicit_maps_pointer_references (int *p){ // CK28-DAG: store ptr [[VAR1:%.+]], ptr [[P0]] // CK28-DAG: [[VAR0]] = load ptr, ptr [[VAR00:%.+]], // CK28-DAG: [[VAR00]] = load ptr, ptr [[VAR000:%.+]], -// CK28-DAG: [[VAR1]] = getelementptr inbounds i32, ptr [[VAR11:%.+]], i{{64|32}} 2 +// CK28-DAG: [[VAR1]] = getelementptr inbounds nuw i32, ptr [[VAR11:%.+]], i{{64|32}} 2 // CK28-DAG: [[VAR11]] = load ptr, ptr [[VAR111:%.+]], // CK28-DAG: [[VAR111]] = load ptr, ptr [[VAR1111:%.+]], diff --git a/clang/test/OpenMP/target_map_codegen_28.cpp b/clang/test/OpenMP/target_map_codegen_28.cpp index e92f7e4773ecf..67ea72d791d03 100644 --- a/clang/test/OpenMP/target_map_codegen_28.cpp +++ b/clang/test/OpenMP/target_map_codegen_28.cpp @@ -89,7 +89,7 @@ struct SSB{ // CK29-DAG: store ptr [[VAR1:%.+]], ptr [[BP2]] // CK29-DAG: store ptr [[VAR2:%.+]], ptr [[P2]] // CK29-DAG: [[VAR1]] = getelementptr inbounds nuw [[SSA]], ptr %{{.+}}, i32 0, i32 1 -// CK29-DAG: [[VAR2]] = getelementptr inbounds double, ptr [[VAR22:%.+]], i{{.+}} 0 +// CK29-DAG: [[VAR2]] = getelementptr inbounds nuw double, ptr [[VAR22:%.+]], i{{.+}} 0 // CK29-DAG: [[VAR22]] = load ptr, ptr %{{.+}}, // CK29: call void [[CALL00:@.+]](ptr {{[^,]+}}) @@ -129,7 +129,7 @@ struct SSB{ // CK29-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2 // CK29-DAG: store ptr [[VAR1]], ptr [[BP2]] // CK29-DAG: store ptr [[VAR2:%.+]], ptr [[P2]] -// CK29-DAG: [[VAR2]] = getelementptr inbounds double, ptr [[VAR22:%.+]], i{{.+}} 0 +// CK29-DAG: [[VAR2]] = getelementptr inbounds nuw double, ptr [[VAR22:%.+]], i{{.+}} 0 // CK29-DAG: [[VAR22]] = load ptr, ptr %{{.+}}, // CK29: call void [[CALL00:@.+]](ptr {{[^,]+}}) @@ -164,7 +164,7 @@ struct SSB{ // CK29-DAG: store ptr [[VAR1:%.+]], ptr [[BP2]] // CK29-DAG: store ptr [[VAR2:%.+]], ptr [[P2]] // CK29-DAG: [[VAR1]] = getelementptr inbounds nuw [[SSA]], ptr %{{.+}}, i32 0, i32 1 -// CK29-DAG: [[VAR2]] = getelementptr inbounds double, ptr [[VAR22:%.+]], i{{.+}} 0 +// CK29-DAG: [[VAR2]] = getelementptr inbounds nuw double, ptr [[VAR22:%.+]], i{{.+}} 0 // CK29-DAG: [[VAR22]] = load ptr, ptr %{{.+}}, // CK29: call void [[CALL00:@.+]](ptr 
{{[^,]+}}) diff --git a/clang/test/OpenMP/target_map_codegen_29.cpp b/clang/test/OpenMP/target_map_codegen_29.cpp index 936a01573c2d2..3ca7b228d26c2 100644 --- a/clang/test/OpenMP/target_map_codegen_29.cpp +++ b/clang/test/OpenMP/target_map_codegen_29.cpp @@ -89,7 +89,7 @@ typedef struct StructWithPtrTag : public Base { // CK30-DAG: [[PTR:%.+]] = getelementptr inbounds [4 x ptr], ptr [[PTRS]], i32 0, i32 2 // CK30-DAG: store ptr [[S_PTR1_BEGIN:%.+]], ptr [[PTR]], // CK30-DAG: [[S_PTR1]] = getelementptr inbounds nuw [[STRUCT]], ptr [[S]], i32 0, i32 4 -// CK30-DAG: [[S_PTR1_BEGIN]] = getelementptr inbounds i32, ptr [[S_PTR1_BEGIN_REF:%.+]], i{{64|32}} 0 +// CK30-DAG: [[S_PTR1_BEGIN]] = getelementptr inbounds nuw i32, ptr [[S_PTR1_BEGIN_REF:%.+]], i{{64|32}} 0 // CK30-DAG: [[S_PTR1_BEGIN_REF]] = load ptr, ptr [[S_PTR1:%.+]], // CK30-DAG: [[S_PTR1]] = getelementptr inbounds nuw [[STRUCT]], ptr [[S]], i32 0, i32 4 @@ -98,7 +98,7 @@ typedef struct StructWithPtrTag : public Base { // CK30-DAG: [[PTR:%.+]] = getelementptr inbounds [4 x ptr], ptr [[PTRS]], i32 0, i32 3 // CK30-DAG: store ptr [[S_PTRBASE1_BEGIN:%.+]], ptr [[PTR]], // CK30-DAG: [[S_PTRBASE1]] = getelementptr inbounds nuw [[BASE]], ptr [[S_BASE:%.+]], i32 0, i32 2 -// CK30-DAG: [[S_PTRBASE1_BEGIN]] = getelementptr inbounds i32, ptr [[S_PTRBASE1_BEGIN_REF:%.+]], i{{64|32}} 0 +// CK30-DAG: [[S_PTRBASE1_BEGIN]] = getelementptr inbounds nuw i32, ptr [[S_PTRBASE1_BEGIN_REF:%.+]], i{{64|32}} 0 // CK30-DAG: [[S_PTRBASE1_BEGIN_REF]] = load ptr, ptr [[S_PTRBASE1:%.+]], // CK30-DAG: [[S_PTRBASE1]] = getelementptr inbounds nuw [[BASE]], ptr [[S_BASE:%.+]], i32 0, i32 2 void map_with_deep_copy() { diff --git a/clang/test/OpenMP/target_map_deref_array_codegen.cpp b/clang/test/OpenMP/target_map_deref_array_codegen.cpp index 9d395b0ab8cd8..e61fc7296332b 100644 --- a/clang/test/OpenMP/target_map_deref_array_codegen.cpp +++ b/clang/test/OpenMP/target_map_deref_array_codegen.cpp @@ -75,7 +75,7 @@ void foo(int **t1d) // CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[T1D_ADDR]], align 8 // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[T1D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 0 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i64 0 // CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 8 // CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 diff --git a/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp b/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp index 7a0da002fb944..692e3a4214c9d 100644 --- a/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp +++ b/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp @@ -28,12 +28,12 @@ struct maptest { // CHECK: getelementptr inbounds // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw %struct.maptest, ptr [[THIS:%.+]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 + // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 // SZ = &this->s.data[6]-&this->s.data[0] // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw 
%struct.maptest, ptr [[THIS]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 + // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 // CHECK: [[S_DATA_6_ADDR:%.+]] = getelementptr float, ptr [[S_DATA_5_ADDR]], i32 1 // CHECK: [[END_BC:%.+]] = ptrtoint ptr [[S_DATA_6_ADDR]] to i64 // CHECK: [[BEG_BC:%.+]] = ptrtoint ptr [[S_DATA_0_ADDR]] to i64 @@ -64,12 +64,12 @@ struct maptest { // CHECK: [[SIZE:%.+]] = alloca [2 x i64], // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw %struct.maptest, ptr [[THIS:%.+]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 + // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 // SZ = &this->s.data[6]-&this->s.data[0] // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw %struct.maptest, ptr [[THIS]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 + // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 // CHECK: [[S_DATA_6_ADDR:%.+]] = getelementptr float, ptr [[S_DATA_5_ADDR]], i32 1 // CHECK: [[END_BC:%.+]] = ptrtoint ptr [[S_DATA_6_ADDR]] to i64 // CHECK: [[BEG_BC:%.+]] = ptrtoint ptr [[S_DATA_0_ADDR]] to i64 diff --git a/clang/test/OpenMP/target_map_member_expr_codegen.cpp b/clang/test/OpenMP/target_map_member_expr_codegen.cpp index 9b64647928a24..fb36ba7b78d5b 100644 --- a/clang/test/OpenMP/target_map_member_expr_codegen.cpp +++ b/clang/test/OpenMP/target_map_member_expr_codegen.cpp @@ -223,7 +223,7 @@ void foo() { // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[A4:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[TMP10]], i32 0, i32 0 // CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[A4]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 0 // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ASIZE]], align 4 // CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP12]] to i64 // CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[CONV]], 4 @@ -233,7 +233,7 @@ void foo() { // CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[C5:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[TMP16]], i32 0, i32 1 // CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[C5]], align 8 -// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 0 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 0 // CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[CSIZE]], align 4 // CHECK-NEXT: [[CONV7:%.*]] = zext i32 [[TMP18]] to i64 // CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[CONV7]], 4 @@ -343,7 +343,7 @@ void foo() { // CHECK-NEXT: [[TMP79:%.*]] = load ptr, ptr [[_TMP12]], align 8 // CHECK-NEXT: [[C15:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[TMP79]], i32 0, i32 1 // CHECK-NEXT: [[TMP80:%.*]] = load ptr, ptr [[C15]], align 8 -// CHECK-NEXT: 
[[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i64 0 +// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP80]], i64 0 // CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[CSIZE]], align 4 // CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[TMP81]] to i64 // CHECK-NEXT: [[TMP82:%.*]] = mul nuw i64 [[CONV17]], 4 diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp b/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp index ffb145d8e50fc..775f0b296b1b6 100644 --- a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp +++ b/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp @@ -45,7 +45,7 @@ void foo() { // CHECK-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1 // CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0 // CHECK-NEXT: store i32 222, ptr [[A]], align 4 -// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 0 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [10 x %struct.D], ptr [[SA]], i64 0, i64 0 // CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK-NEXT: store ptr [[SA]], ptr [[TMP0]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 diff --git a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp index 3d0710acf0ee7..5cce677e88572 100644 --- a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp @@ -96,16 +96,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -135,7 +135,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = 
getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -150,19 +150,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP36]], align 8 @@ -483,9 +483,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr 
[[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp index 28d63dbf8c4a9..c0bb4a6d6cc82 100644 --- a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp @@ -85,16 +85,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -139,19 +139,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr 
[[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -429,9 +429,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_task_affinity_codegen.cpp b/clang/test/OpenMP/target_task_affinity_codegen.cpp index 85c5d63a6cd9c..53960cee4b730 100644 --- a/clang/test/OpenMP/target_task_affinity_codegen.cpp +++ b/clang/test/OpenMP/target_task_affinity_codegen.cpp @@ -76,7 +76,7 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP3]], align 8 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -102,7 +102,7 @@ int main() { // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[B]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[B]], align 
8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i64 0 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[TMP17]], ptr [[TMP19]], align 8 // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0 @@ -174,9 +174,9 @@ int main() { // CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 48, i64 8, ptr @.omp_task_entry.) // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1023 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 1023 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i64 @@ -299,7 +299,7 @@ int main() { // CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 0 // CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[TMP1]], ptr [[TMP3]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -325,7 +325,7 @@ int main() { // CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 4 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[B]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[B]], align 4 -// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0 +// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i32 0 // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[TMP17]], ptr [[TMP19]], align 4 // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0 @@ -397,9 +397,9 @@ int main() { // CHECK3-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 24, i32 4, ptr @.omp_task_entry.) 
// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i32 0, i32 0 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0 // CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1023 +// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1023 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK3-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i32 // CHECK3-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i32 @@ -587,9 +587,9 @@ int main() { // CHECK9-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 48, i64 8, ptr @.omp_task_entry.) // CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i64 0, i64 0 // CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i64 0 // CHECK9-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK9-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1023 +// CHECK9-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 1023 // CHECK9-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK9-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK9-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i64 @@ -709,9 +709,9 @@ int main() { // CHECK11-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 24, i32 4, ptr @.omp_task_entry.) 
// CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i32 0, i32 0 // CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0 // CHECK11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1023 +// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1023 // CHECK11-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK11-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i32 // CHECK11-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i32 diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp index 6f671dbb27abb..2c36b410af064 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp @@ -91,16 +91,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -130,7 +130,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw 
[[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -145,19 +145,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -435,16 +435,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP6]] // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP7]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP7]], i64 9 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = 
getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -474,7 +474,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP22]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -489,19 +489,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..4, ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP30]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP33]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i64 0 // CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP36]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP36]], i64 9 // CHECK1-NEXT: [[TMP37:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP38]], align 8 @@ -822,9 +822,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // 
CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp index 5e038989ab6dd..c8211f475c7fc 100644 --- a/clang/test/OpenMP/target_update_codegen.cpp +++ b/clang/test/OpenMP/target_update_codegen.cpp @@ -1118,9 +1118,9 @@ struct ST { void foo(int arg) { ST arr[3][4]; // CK20: [[DIMS:%.+]] = alloca [3 x [[STRUCT_DESCRIPTOR]]], - // CK20: [[ARRAY_IDX:%.+]] = getelementptr inbounds [3 x [4 x [[STRUCT_ST]]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK20: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [3 x [4 x [[STRUCT_ST]]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK20: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [4 x [[STRUCT_ST]]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK20: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [[STRUCT_ST]], ptr [[ARRAY_DECAY]], {{.+}} + // CK20: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[ARRAY_DECAY]], {{.+}} // CK20: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0 // CK20: store ptr [[ARR]], ptr [[BP0]], // CK20: [[P0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[P:%.+]], {{.+}} 0, {{.+}} 0 @@ -1186,9 +1186,9 @@ struct ST { // CK21: _ZN2ST3fooEv void foo() { // CK21: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK21: [[ARRAY_IDX:%.+]] = getelementptr inbounds [10 x [10 x [10 x ptr]]], ptr [[DPTR:%.+]], {{.+}} 0, {{.+}} 0 + // CK21: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [10 x [10 x [10 x ptr]]], ptr [[DPTR:%.+]], {{.+}} 0, {{.+}} 0 // CK21: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [10 x [10 x ptr]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK21: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARRAY_DECAY]], {{.+}} 1 + // CK21: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [10 x ptr], ptr [[ARRAY_DECAY]], {{.+}} 1 // CK21: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 // CK21: [[ARRAY_IDX_3:%.+]] = getelementptr inbounds {{.+}}, ptr [[ARRAY_DECAY_2]], {{.+}} 0 // CK21: [[BP0:%.+]] = getelementptr inbounds [2 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0 @@ -1262,9 +1262,9 @@ struct ST { // CK22: _ZN2ST3fooEPA10_Pi void foo(int *arr[5][10]) { // CK22: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK22: [[ARRAY_IDX:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARR:%.+]], {{.+}} 0 + // CK22: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [10 x ptr], ptr [[ARR:%.+]], {{.+}} 0 // CK22: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK22: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds ptr, ptr [[ARRAY_DECAY:%.+]], {{.+}} 1 + // CK22: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds nuw ptr, ptr [[ARRAY_DECAY:%.+]], {{.+}} 1 // CK22: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, 
{{.+}} 0 // CK22: [[P0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[P:%.+]], i{{.+}} 0, i{{.+}} 0 // CK22: [[DIM_1:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 0 @@ -1338,11 +1338,11 @@ void foo(int arg) { float farr[5][5][5]; // CK23: [[ARG_ADDR:%.+]] = alloca i32, // CK23: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK23: [[ARRAY_IDX:%.+]] = getelementptr inbounds [5 x [5 x [5 x float]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK23: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [5 x [5 x [5 x float]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK23: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [5 x [5 x float]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK23: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [5 x float], ptr [[ARRAY_DECAY]], {{.+}} + // CK23: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [5 x float], ptr [[ARRAY_DECAY]], {{.+}} // CK23: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [5 x float], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 - // CK23: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds float, ptr [[ARRAY_DECAY_2]], {{.+}} + // CK23: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds nuw float, ptr [[ARRAY_DECAY_2]], {{.+}} // CK23: [[MUL:%.+]] = mul nuw i64 4, // CK23: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0 // CK23: store ptr [[ARR]], ptr [[BP0]], @@ -1411,11 +1411,11 @@ void foo(int arg) { void foo(int arg) { double darr[3][4][5]; // CK24: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK24: [[ARRAY_IDX:%.+]] = getelementptr inbounds [3 x [4 x [5 x double]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK24: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [3 x [4 x [5 x double]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK24: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [4 x [5 x double]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK24: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [5 x double], ptr [[ARRAY_DECAY]], {{.+}} + // CK24: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [5 x double], ptr [[ARRAY_DECAY]], {{.+}} // CK24: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [5 x double], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 - // CK24: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds double, ptr [[ARRAY_DECAY_2]], {{.+}} + // CK24: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds nuw double, ptr [[ARRAY_DECAY_2]], {{.+}} // CK24: [[MUL:%.+]] = mul nuw i64 8, // CK24: [[SUB:%.+]] = sub nuw i64 4, [[ARG:%.+]] // CK24: [[LEN:%.+]] = udiv {{.+}} [[SUB]], 1 @@ -1488,15 +1488,15 @@ void foo(int arg) { // CK25: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], // CK25: [[DIMS_2:%.+]] = alloca [3 x [[STRUCT_DESCRIPTOR]]], - // CK25: [[ARRAY_IDX:%.+]] = getelementptr inbounds [3 x [4 x [5 x i32]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK25: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [3 x [4 x [5 x i32]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK25: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [4 x [5 x i32]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK25: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [5 x i32], ptr [[ARRAY_DECAY]], {{.+}} + // CK25: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [5 x i32], ptr [[ARRAY_DECAY]], {{.+}} // CK25: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [5 x i32], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 - // CK25: [[ARRAY_IDX_3:%.+]] = getelementptr inbounds {{.+}}, ptr [[ARRAY_DECAY_2]], {{.+}} 1 + // CK25: [[ARRAY_IDX_3:%.+]] = getelementptr inbounds nuw {{.+}}, ptr [[ARRAY_DECAY_2]], {{.+}} 1 // CK25: [[LEN:%.+]] = sub nuw 
i64 4, [[ARG_ADDR:%.+]] - // CK25: [[ARRAY_IDX_4:%.+]] = getelementptr inbounds [4 x [3 x float]], ptr [[FARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK25: [[ARRAY_IDX_4:%.+]] = getelementptr inbounds nuw [4 x [3 x float]], ptr [[FARR:%.+]], {{.+}} 0, {{.+}} 0 // CK25: [[ARRAY_DECAY_5:%.+]] = getelementptr inbounds [3 x float], ptr [[ARRAY_IDX_4]], {{.+}} 0, {{.+}} 0 - // CK25: [[ARRAY_IDX_6:%.+]] = getelementptr inbounds float, ptr [[ARRAY_DECAY_5:%.+]], {{.+}} 1 + // CK25: [[ARRAY_IDX_6:%.+]] = getelementptr inbounds nuw float, ptr [[ARRAY_DECAY_5:%.+]], {{.+}} 1 // CK25: [[BP0:%.+]] = getelementptr inbounds [3 x ptr], ptr [[BP:%.+]], i{{.+}} 0, i{{.+}} 0 // CK25: [[P0:%.+]] = getelementptr inbounds [3 x ptr], ptr [[P:%.+]], i{{.+}} 0, i{{.+}} 0 // CK25: [[DIM_1:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 0 diff --git a/clang/test/OpenMP/task_codegen.c b/clang/test/OpenMP/task_codegen.c index 0d10cbce4aa80..d08eb3762d5c9 100644 --- a/clang/test/OpenMP/task_codegen.c +++ b/clang/test/OpenMP/task_codegen.c @@ -183,7 +183,7 @@ for (int i = 0; i < 10; ++i) // CHECK: [[A:%.+]] = load ptr, ptr [[A_ADDR:%.+]], // CHECK: [[K:%.+]] = load i32, ptr [[K_ADDR]], // CHECK: [[IDX:%.+]] = zext i32 [[K]] to i64 - // CHECK: [[AK_ADDR:%.+]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IDX]] + // CHECK: [[AK_ADDR:%.+]] = getelementptr inbounds nuw ptr, ptr [[A]], i64 [[IDX]] // CHECK: [[AK:%.+]] = load ptr, ptr [[AK_ADDR]], // CHECK: [[I:%.+]] = load i32, ptr [[I_ADDR]], // CHECK: [[IDX:%.+]] = sext i32 [[I]] to i64 diff --git a/clang/test/OpenMP/task_codegen.cpp b/clang/test/OpenMP/task_codegen.cpp index b256c41132ed3..c3e6d9e6b1cf7 100644 --- a/clang/test/OpenMP/task_codegen.cpp +++ b/clang/test/OpenMP/task_codegen.cpp @@ -309,9 +309,9 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK1-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK1-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK1-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -346,13 +346,13 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK1-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK1-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK1-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// 
CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK1-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK1-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK1-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -384,13 +384,13 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK1-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK1-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK1-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK1-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK1-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK1-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -427,8 +427,8 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK1-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK1-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK1-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK1-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK1-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -436,8 +436,8 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK1-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP117]] -// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK1-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK1-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK1-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK1-NEXT: [[TMP120:%.*]] = ptrtoint 
ptr [[TMP118]] to i64 @@ -1432,9 +1432,9 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK1-51-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK1-51-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK1-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK1-51-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK1-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK1-51-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK1-51-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-51-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -1469,13 +1469,13 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK1-51-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK1-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK1-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// CHECK1-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK1-51-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK1-51-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// CHECK1-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK1-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK1-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK1-51-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK1-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK1-51-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -1507,13 +1507,13 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK1-51-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK1-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK1-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK1-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK1-51-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK1-51-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK1-51-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK1-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK1-51-NEXT: [[ARRAYIDX18:%.*]] = 
getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK1-51-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK1-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK1-51-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -1550,8 +1550,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK1-51-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK1-51-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK1-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK1-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK1-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK1-51-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK1-51-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK1-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -1559,8 +1559,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK1-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK1-51-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP117]] -// CHECK1-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK1-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK1-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK1-51-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK1-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK1-51-NEXT: [[TMP120:%.*]] = ptrtoint ptr [[TMP118]] to i64 @@ -1595,8 +1595,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP139:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP136]], i32 0, i32 2 // CHECK1-51-NEXT: store i8 8, ptr [[TMP139]], align 8 // CHECK1-51-NEXT: [[TMP140:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP140]] -// CHECK1-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX31]], i64 3 +// CHECK1-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP140]] +// CHECK1-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX31]], i64 3 // CHECK1-51-NEXT: [[TMP141:%.*]] = load i32, ptr @a, align 4 // CHECK1-51-NEXT: [[TMP142:%.*]] = sext i32 [[TMP141]] to i64 // CHECK1-51-NEXT: [[LEN_SUB_133:%.*]] = sub nsw i64 [[TMP142]], 1 @@ -1604,8 +1604,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP144:%.*]] = sext i32 [[TMP143]] to i64 // CHECK1-51-NEXT: [[LB_ADD_LEN34:%.*]] = add nsw i64 -1, [[TMP144]] // CHECK1-51-NEXT: [[TMP145:%.*]] = mul nsw i64 [[LB_ADD_LEN34]], [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP145]] -// CHECK1-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_133]] +// CHECK1-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP145]] +// CHECK1-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr 
[[ARRAYIDX35]], i64 [[LEN_SUB_133]] // CHECK1-51-NEXT: [[TMP146:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK1-51-NEXT: [[TMP147:%.*]] = ptrtoint ptr [[ARRAYIDX32]] to i64 // CHECK1-51-NEXT: [[TMP148:%.*]] = ptrtoint ptr [[TMP146]] to i64 @@ -3040,9 +3040,9 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK2-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK2-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK2-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK2-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK2-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK2-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -3077,13 +3077,13 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK2-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK2-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK2-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK2-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK2-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK2-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -3115,13 +3115,13 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK2-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK2-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK2-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK2-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK2-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK2-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK2-NEXT: 
[[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK2-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK2-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK2-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK2-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -3158,8 +3158,8 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK2-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK2-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK2-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK2-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK2-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK2-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK2-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK2-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -3167,8 +3167,8 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK2-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK2-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP117]] -// CHECK2-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK2-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK2-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK2-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK2-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK2-NEXT: [[TMP120:%.*]] = ptrtoint ptr [[TMP118]] to i64 @@ -4163,9 +4163,9 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK2-51-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK2-51-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK2-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK2-51-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK2-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK2-51-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK2-51-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK2-51-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -4200,13 +4200,13 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK2-51-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK2-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK2-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// 
CHECK2-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK2-51-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK2-51-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// CHECK2-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK2-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK2-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK2-51-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK2-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK2-51-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -4238,13 +4238,13 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK2-51-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK2-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK2-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK2-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK2-51-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK2-51-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK2-51-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK2-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK2-51-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK2-51-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK2-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK2-51-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -4281,8 +4281,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK2-51-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK2-51-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK2-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK2-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK2-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK2-51-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK2-51-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK2-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -4290,8 +4290,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK2-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK2-51-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 
[[TMP117]] -// CHECK2-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK2-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK2-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK2-51-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK2-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK2-51-NEXT: [[TMP120:%.*]] = ptrtoint ptr [[TMP118]] to i64 @@ -4326,8 +4326,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP139:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP136]], i32 0, i32 2 // CHECK2-51-NEXT: store i8 8, ptr [[TMP139]], align 8 // CHECK2-51-NEXT: [[TMP140:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP140]] -// CHECK2-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX31]], i64 3 +// CHECK2-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP140]] +// CHECK2-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX31]], i64 3 // CHECK2-51-NEXT: [[TMP141:%.*]] = load i32, ptr @a, align 4 // CHECK2-51-NEXT: [[TMP142:%.*]] = sext i32 [[TMP141]] to i64 // CHECK2-51-NEXT: [[LEN_SUB_133:%.*]] = sub nsw i64 [[TMP142]], 1 @@ -4335,8 +4335,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP144:%.*]] = sext i32 [[TMP143]] to i64 // CHECK2-51-NEXT: [[LB_ADD_LEN34:%.*]] = add nsw i64 -1, [[TMP144]] // CHECK2-51-NEXT: [[TMP145:%.*]] = mul nsw i64 [[LB_ADD_LEN34]], [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP145]] -// CHECK2-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_133]] +// CHECK2-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP145]] +// CHECK2-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_133]] // CHECK2-51-NEXT: [[TMP146:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK2-51-NEXT: [[TMP147:%.*]] = ptrtoint ptr [[ARRAYIDX32]] to i64 // CHECK2-51-NEXT: [[TMP148:%.*]] = ptrtoint ptr [[TMP146]] to i64 @@ -5773,9 +5773,9 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK3-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK3-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK3-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK3-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK3-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK3-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -5814,13 +5814,13 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK3-NEXT: [[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK3-NEXT: 
[[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK3-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK3-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK3-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK3-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK3-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK3-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK3-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK3-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -5854,13 +5854,13 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK3-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK3-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK3-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK3-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK3-NEXT: [[TMP85:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK3-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK3-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK3-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK3-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK3-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK3-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK3-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -5899,8 +5899,8 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK3-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK3-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK3-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK3-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK3-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK3-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK3-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK3-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 1 @@ -5908,8 +5908,8 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK3-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK3-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK3-NEXT: 
[[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK3-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK3-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK3-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK3-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK3-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK3-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 @@ -6789,9 +6789,9 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK4-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK4-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK4-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK4-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK4-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK4-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK4-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -6830,13 +6830,13 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK4-NEXT: [[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK4-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK4-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK4-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK4-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK4-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK4-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK4-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK4-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK4-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -6870,13 +6870,13 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK4-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK4-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK4-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK4-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK4-NEXT: [[TMP85:%.*]] = load i8, ptr 
[[B]], align 1 // CHECK4-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK4-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK4-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK4-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK4-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK4-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK4-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK4-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -6915,8 +6915,8 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK4-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK4-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK4-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK4-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK4-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK4-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK4-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK4-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 1 @@ -6924,8 +6924,8 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK4-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK4-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK4-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK4-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK4-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK4-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK4-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK4-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 @@ -7808,9 +7808,9 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK3-51-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK3-51-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK3-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK3-51-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK3-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK3-51-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK3-51-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK3-51-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -7849,13 +7849,13 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK3-51-NEXT: 
[[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK3-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK3-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK3-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK3-51-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK3-51-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK3-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK3-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK3-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK3-51-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK3-51-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK3-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -7889,13 +7889,13 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK3-51-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK3-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK3-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK3-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK3-51-NEXT: [[TMP85:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK3-51-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK3-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK3-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK3-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK3-51-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK3-51-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK3-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -7934,8 +7934,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK3-51-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK3-51-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK3-51-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK3-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK3-51-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK3-51-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK3-51-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK3-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 
1 @@ -7943,8 +7943,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK3-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK3-51-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK3-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK3-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK3-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK3-51-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK3-51-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK3-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 @@ -7981,8 +7981,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP138:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP135]], i32 0, i32 2 // CHECK3-51-NEXT: store i8 8, ptr [[TMP138]], align 8 // CHECK3-51-NEXT: [[TMP139:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP139]] -// CHECK3-51-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX43]], i64 3 +// CHECK3-51-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP139]] +// CHECK3-51-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX43]], i64 3 // CHECK3-51-NEXT: [[TMP140:%.*]] = load i32, ptr @a, align 4 // CHECK3-51-NEXT: [[TMP141:%.*]] = sext i32 [[TMP140]] to i64 // CHECK3-51-NEXT: [[LEN_SUB_145:%.*]] = sub nsw i64 [[TMP141]], 1 @@ -7990,8 +7990,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP143:%.*]] = sext i32 [[TMP142]] to i64 // CHECK3-51-NEXT: [[LB_ADD_LEN46:%.*]] = add nsw i64 -1, [[TMP143]] // CHECK3-51-NEXT: [[TMP144:%.*]] = mul nsw i64 [[LB_ADD_LEN46]], [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP144]] -// CHECK3-51-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX47]], i64 [[LEN_SUB_145]] +// CHECK3-51-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP144]] +// CHECK3-51-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX47]], i64 [[LEN_SUB_145]] // CHECK3-51-NEXT: [[TMP145:%.*]] = getelementptr i32, ptr [[ARRAYIDX48]], i32 1 // CHECK3-51-NEXT: [[TMP146:%.*]] = ptrtoint ptr [[ARRAYIDX44]] to i64 // CHECK3-51-NEXT: [[TMP147:%.*]] = ptrtoint ptr [[TMP145]] to i64 @@ -9323,9 +9323,9 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK4-51-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK4-51-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK4-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK4-51-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK4-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK4-51-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK4-51-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK4-51-NEXT: 
[[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -9364,13 +9364,13 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK4-51-NEXT: [[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK4-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK4-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK4-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK4-51-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK4-51-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK4-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK4-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK4-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK4-51-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK4-51-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK4-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -9404,13 +9404,13 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK4-51-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK4-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK4-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK4-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK4-51-NEXT: [[TMP85:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK4-51-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK4-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK4-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK4-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK4-51-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK4-51-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK4-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -9449,8 +9449,8 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK4-51-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK4-51-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK4-51-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK4-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK4-51-NEXT: [[ARRAYIDX34:%.*]] = 
getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK4-51-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK4-51-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK4-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 1 @@ -9458,8 +9458,8 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK4-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK4-51-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK4-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK4-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK4-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK4-51-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK4-51-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK4-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 diff --git a/clang/test/OpenMP/task_in_reduction_codegen.cpp b/clang/test/OpenMP/task_in_reduction_codegen.cpp index aa2a478137990..29dc12978d7d9 100644 --- a/clang/test/OpenMP/task_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/task_in_reduction_codegen.cpp @@ -90,7 +90,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -105,7 +105,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -120,7 +120,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: 
[[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -138,7 +138,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -153,7 +153,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp b/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp index faf86479dfdae..a8577c7a13579 100644 --- a/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp @@ -67,7 +67,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[A_REF]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[A]], ptr [[A_REF:[^,]+]], // CHECK-DAG: [[A_REF]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA]], i32 0, i32 1 -// CHECK-DAG: [[GEPA]] = getelementptr inbounds [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 +// CHECK-DAG: [[GEPA]] = getelementptr inbounds nuw [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 // CHECK-DAG: [[TMP6:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA]], i32 0, i32 2 // CHECK-DAG: store i64 4, ptr [[TMP6]], // CHECK-DAG: [[TMP7:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA]], i32 0, i32 3 @@ -82,7 +82,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP12]] = getelementptr inbounds nuw [[T1]], ptr 
[[GEPB:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[B]], ptr [[TMP12:%[^,]+]], // CHECK-DAG: [[TMP12]] = getelementptr inbounds nuw [[T1]], ptr [[GEPB]], i32 0, i32 1 -// CHECK-DAG: [[GEPB]] = getelementptr inbounds [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 +// CHECK-DAG: [[GEPB]] = getelementptr inbounds nuw [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 // CHECK-DAG: [[TMP14:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPB]], i32 0, i32 2 // CHECK-DAG: store i64 4, ptr [[TMP14]], // CHECK-DAG: [[TMP15:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPB]], i32 0, i32 3 @@ -97,7 +97,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP20]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARGC_ADDR]], ptr [[TMP20:%[^,]+]], // CHECK-DAG: [[TMP20]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC]], i32 0, i32 1 -// CHECK-DAG: [[GEPARGC]] = getelementptr inbounds [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 +// CHECK-DAG: [[GEPARGC]] = getelementptr inbounds nuw [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 // CHECK-DAG: [[TMP22:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC]], i32 0, i32 2 // CHECK-DAG: store i64 4, ptr [[TMP22]], // CHECK-DAG: [[TMP23:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC]], i32 0, i32 3 @@ -116,7 +116,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP30]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[C]], ptr [[TMP30:%[^,]+]], // CHECK-DAG: [[TMP30]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC]], i32 0, i32 1 -// CHECK-DAG: [[GEPC]] = getelementptr inbounds [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 +// CHECK-DAG: [[GEPC]] = getelementptr inbounds nuw [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 // CHECK-DAG: [[TMP32:%.+]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC]], i32 0, i32 2 // CHECK-DAG: store i64 20, ptr [[TMP32]], // CHECK-DAG: [[TMP33:%.+]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC]], i32 0, i32 3 @@ -131,7 +131,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP38]] = getelementptr inbounds nuw [[T2]], ptr [[GEPVLA:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[VLA]], ptr [[TMP38:%[^,]+]], // CHECK-DAG: [[TMP38]] = getelementptr inbounds nuw [[T2]], ptr [[GEPVLA]], i32 0, i32 1 -// CHECK-DAG: [[GEPVLA]] = getelementptr inbounds [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 +// CHECK-DAG: [[GEPVLA]] = getelementptr inbounds nuw [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 // CHECK-DAG: [[TMP40:%.+]] = mul nuw i64 [[VLA_SIZE]], 2 // CHECK-DAG: [[TMP41:%.+]] = udiv exact i64 [[TMP40]], ptrtoint (ptr getelementptr (i16, ptr null, i32 1) to i64) // CHECK-DAG: [[TMP42:%.+]] = getelementptr inbounds nuw [[T2]], ptr [[GEPVLA]], i32 0, i32 2 diff --git a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp index ae0d007756140..87b4cd2caf18a 100644 --- a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], 
i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x 
%struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/taskloop_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_reduction_codegen.cpp index 3cdc88ba20b77..6eca033eca551 100644 --- a/clang/test/OpenMP/taskloop_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_reduction_codegen.cpp @@ -83,9 +83,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -137,10 +137,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp index 6da28d2d973c9..9e4e51a442742 100644 --- 
a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr 
[[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp index d6e40831484aa..83ae053cfd9bd 100644 --- a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp @@ -80,9 +80,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -134,10 +134,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// 
CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp index be499e0b36548..7987c2de7dd8f 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp @@ -100,16 +100,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -154,19 +154,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 
false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -444,16 +444,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP6]] // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP7]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP7]], i64 9 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -483,7 +483,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP22]] // CHECK1-NEXT: store ptr 
[[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -498,19 +498,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..4, ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP30]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP33]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i64 0 // CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP36]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP36]], i64 9 // CHECK1-NEXT: [[TMP37:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP38]], align 8 @@ -831,9 +831,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 
[[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] From 122874c955e06defb619b1afd4e26db482dbbf19 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 5 Sep 2024 16:17:18 +0100 Subject: [PATCH 239/425] [X86] Fold scalar_to_vector(shift(x,imm)) -> vshift(scalar_to_vector(x),imm) Noticed while working on #107289 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +++++- llvm/test/CodeGen/X86/buildvec-insertvec.ll | 4 +- .../test/CodeGen/X86/known-signbits-vector.ll | 7 +- .../test/CodeGen/X86/load-scalar-as-vector.ll | 18 ++-- llvm/test/CodeGen/X86/pr44915.ll | 15 ++-- llvm/test/CodeGen/X86/vec_insert-5.ll | 7 +- llvm/test/CodeGen/X86/vec_shift5.ll | 16 ++-- llvm/test/CodeGen/X86/vector-sext.ll | 83 +++++++++---------- .../CodeGen/X86/vector-shuffle-combining.ll | 36 ++++---- 9 files changed, 109 insertions(+), 101 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5f87ffd2f1eab..a4ad4a1bb1201 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57403,7 +57403,8 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { +static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); @@ -57482,6 +57483,25 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { // coverage. } + // Check for cases where we've ended up with a scalarized shift, typically + // during type legalization. 
+ switch (Src.getOpcode()) { + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) { + if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) && + Src.hasOneUse()) { + SDValue SrcVec = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0)); + unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false); + return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec, + Amt->getZExtValue(), DAG); + } + } + break; + } + return SDValue(); } @@ -58034,7 +58054,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, // clang-format off default: break; case ISD::SCALAR_TO_VECTOR: - return combineScalarToVector(N, DAG); + return combineScalarToVector(N, DAG, Subtarget); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index ae70b6a5a4665..4b0e5441b4abf 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -726,9 +726,9 @@ define void @PR46461(i16 %x, ptr %y) { ; SSE-LABEL: PR46461: ; SSE: # %bb.0: ; SSE-NEXT: movzwl %di, %eax -; SSE-NEXT: shrl %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: psrld $1, %xmm0 ; SSE-NEXT: movdqa %xmm0, 48(%rsi) ; SSE-NEXT: movdqa %xmm0, 32(%rsi) ; SSE-NEXT: movdqa %xmm0, 16(%rsi) @@ -738,9 +738,9 @@ define void @PR46461(i16 %x, ptr %y) { ; AVX1-LABEL: PR46461: ; AVX1: # %bb.0: ; AVX1-NEXT: movzwl %di, %eax -; AVX1-NEXT: shrl %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX1-NEXT: vmovaps %ymm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index 9648daf7427b1..45b61155fe626 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -220,11 +220,10 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: sarl $30, %ecx -; X86-NEXT: shll $2, %eax ; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X86-NEXT: sarl $30, %eax +; X86-NEXT: vpslld $2, %xmm0, %xmm0 +; X86-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; X86-NEXT: vpsrlq $3, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll index 3edbcd1fe18eb..d2359ced3e19d 100644 --- a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -274,16 +274,14 @@ define <2 x i64> @lshr_op0_constant(ptr %p) nounwind { define <4 x i32> @lshr_op1_constant(ptr %p) nounwind { ; SSE-LABEL: lshr_op1_constant: ; SSE: # %bb.0: -; SSE-NEXT: movl (%rdi), %eax -; SSE-NEXT: shrl $17, %eax -; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: psrld $17, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: lshr_op1_constant: ; AVX: # %bb.0: -; AVX-NEXT: movl (%rdi), %eax -; AVX-NEXT: shrl $17, %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpsrld $17, %xmm0, %xmm0 ; AVX-NEXT: retq %x = load i32, ptr %p %b = lshr i32 %x,
17 @@ -317,15 +315,15 @@ define <8 x i16> @ashr_op1_constant(ptr %p) nounwind { ; SSE-LABEL: ashr_op1_constant: ; SSE: # %bb.0: ; SSE-NEXT: movswl (%rdi), %eax -; SSE-NEXT: sarl $7, %eax ; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: psrad $7, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: ashr_op1_constant: ; AVX: # %bb.0: ; AVX-NEXT: movswl (%rdi), %eax -; AVX-NEXT: sarl $7, %eax ; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpsrad $7, %xmm0, %xmm0 ; AVX-NEXT: retq %x = load i16, ptr %p %b = ashr i16 %x, 7 @@ -474,8 +472,8 @@ define <2 x i64> @udiv_op1_constant(ptr %p) nounwind { ; SSE-NEXT: shrq %rax ; SSE-NEXT: movabsq $-4392081922311798003, %rcx # imm = 0xC30C30C30C30C30D ; SSE-NEXT: mulq %rcx -; SSE-NEXT: shrq $4, %rdx ; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: psrlq $4, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: udiv_op1_constant: @@ -484,8 +482,8 @@ define <2 x i64> @udiv_op1_constant(ptr %p) nounwind { ; AVX-NEXT: shrq %rax ; AVX-NEXT: movabsq $-4392081922311798003, %rcx # imm = 0xC30C30C30C30C30D ; AVX-NEXT: mulq %rcx -; AVX-NEXT: shrq $4, %rdx ; AVX-NEXT: vmovq %rdx, %xmm0 +; AVX-NEXT: vpsrlq $4, %xmm0, %xmm0 ; AVX-NEXT: retq %x = load i64, ptr %p %b = udiv i64 %x, 42 diff --git a/llvm/test/CodeGen/X86/pr44915.ll b/llvm/test/CodeGen/X86/pr44915.ll index 1ebdd9ccb3190..99205ab60ae11 100644 --- a/llvm/test/CodeGen/X86/pr44915.ll +++ b/llvm/test/CodeGen/X86/pr44915.ll @@ -52,15 +52,14 @@ define i32 @extract3(ptr, i32) nounwind { ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $7, %ecx ; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: andl $7, %ecx -; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: shrl $3, %eax +; X64-NEXT: andl $7, %eax +; X64-NEXT: movd %eax, %xmm3 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: shrl $12, %eax -; X64-NEXT: movd %eax, %xmm1 -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: psrld $12, %xmm2 +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-NEXT: movdqa %xmm0, -24(%rsp) ; X64-NEXT: andl $7, %esi ; X64-NEXT: movzwl -24(%rsp,%rsi,2), %eax diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll index 176ae81e08a76..91743898545ee 100644 --- a/llvm/test/CodeGen/X86/vec_insert-5.ll +++ b/llvm/test/CodeGen/X86/vec_insert-5.ll @@ -9,17 +9,16 @@ define void @t1(i32 %a, ptr %P) nounwind { ; X86-LABEL: t1: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $12, %ecx -; X86-NEXT: movd %ecx, %xmm0 +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pslld $12, %xmm0 ; X86-NEXT: psllq $32, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: t1: ; X64: # %bb.0: -; X64-NEXT: shll $12, %edi ; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pslld $12, %xmm0 ; X64-NEXT: psllq $32, %xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll index f8bc6b01c70a8..2ab00ea96ada1 100644 --- a/llvm/test/CodeGen/X86/vec_shift5.ll +++ b/llvm/test/CodeGen/X86/vec_shift5.ll @@ -290,13 +290,15 @@ define <4 x i32> @extelt0_twice_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x ; This would crash because the scalar shift amount has a different type than the shift result. 
define <2 x i8> @PR58661(<2 x i8> %a0) { -; CHECK-LABEL: PR58661: -; CHECK: # %bb.0: -; CHECK-NEXT: psrlw $8, %xmm0 -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: PR58661: +; X86: # %bb.0: +; X86-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: PR58661: +; X64: # %bb.0: +; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: retq %shuffle = shufflevector <2 x i8> %a0, <2 x i8> , <2 x i32> %x = bitcast <2 x i8> %shuffle to i16 %shl = shl nuw i16 %x, 8 diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index dc9e69137a8a7..d44b11f4ca1da 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -3601,15 +3601,14 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; SSE2-LABEL: sext_4i17_to_4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shll $15, %ecx -; SSE2-NEXT: sarl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pslld $15, %xmm0 +; SSE2-NEXT: psrad $15, %xmm0 ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq $17, %rcx -; SSE2-NEXT: shll $15, %ecx -; SSE2-NEXT: sarl $15, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslld $15, %xmm1 +; SSE2-NEXT: psrad $15, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movl 8(%rdi), %ecx ; SSE2-NEXT: shll $28, %ecx @@ -3617,12 +3616,12 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; SSE2-NEXT: shrq $51, %rdx ; SSE2-NEXT: shll $15, %edx ; SSE2-NEXT: orl %ecx, %edx -; SSE2-NEXT: sarl $15, %edx ; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: psrad $15, %xmm1 ; SSE2-NEXT: shrq $34, %rax -; SSE2-NEXT: shll $15, %eax -; SSE2-NEXT: sarl $15, %eax ; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pslld $15, %xmm2 +; SSE2-NEXT: psrad $15, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq @@ -3630,15 +3629,14 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; SSSE3-LABEL: sext_4i17_to_4i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq (%rdi), %rax -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shll $15, %ecx -; SSSE3-NEXT: sarl $15, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pslld $15, %xmm0 +; SSSE3-NEXT: psrad $15, %xmm0 ; SSSE3-NEXT: movq %rax, %rcx ; SSSE3-NEXT: shrq $17, %rcx -; SSSE3-NEXT: shll $15, %ecx -; SSSE3-NEXT: sarl $15, %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: pslld $15, %xmm1 +; SSSE3-NEXT: psrad $15, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movl 8(%rdi), %ecx ; SSSE3-NEXT: shll $28, %ecx @@ -3646,12 +3644,12 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; SSSE3-NEXT: shrq $51, %rdx ; SSSE3-NEXT: shll $15, %edx ; SSSE3-NEXT: orl %ecx, %edx -; SSSE3-NEXT: sarl $15, %edx ; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: psrad $15, %xmm1 ; SSSE3-NEXT: shrq $34, %rax -; SSSE3-NEXT: shll $15, %eax -; SSSE3-NEXT: sarl $15, %eax ; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: pslld $15, %xmm2 +; SSSE3-NEXT: psrad $15, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: retq @@ -3663,10 +3661,9 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; SSE41-NEXT: shrq $17, %rcx ; SSE41-NEXT: shll $15, %ecx ; SSE41-NEXT: sarl $15, %ecx -; 
SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: shll $15, %edx -; SSE41-NEXT: sarl $15, %edx -; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pslld $15, %xmm0 +; SSE41-NEXT: psrad $15, %xmm0 ; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq $34, %rcx @@ -3689,10 +3686,9 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; AVX-NEXT: shrq $17, %rcx ; AVX-NEXT: shll $15, %ecx ; AVX-NEXT: sarl $15, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: shll $15, %edx -; AVX-NEXT: sarl $15, %edx -; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslld $15, %xmm0, %xmm0 +; AVX-NEXT: vpsrad $15, %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $34, %rcx @@ -3711,25 +3707,24 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; X86-SSE2-LABEL: sext_4i17_to_4i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: movl (%edx), %ecx -; X86-SSE2-NEXT: movl 4(%edx), %eax +; X86-SSE2-NEXT: movl (%edx), %eax +; X86-SSE2-NEXT: movl 4(%edx), %ecx ; X86-SSE2-NEXT: movl 8(%edx), %edx -; X86-SSE2-NEXT: shldl $13, %eax, %edx -; X86-SSE2-NEXT: shll $15, %edx -; X86-SSE2-NEXT: sarl $15, %edx +; X86-SSE2-NEXT: shldl $13, %ecx, %edx ; X86-SSE2-NEXT: movd %edx, %xmm0 -; X86-SSE2-NEXT: movl %eax, %edx -; X86-SSE2-NEXT: shll $13, %edx -; X86-SSE2-NEXT: sarl $15, %edx -; X86-SSE2-NEXT: movd %edx, %xmm1 +; X86-SSE2-NEXT: pslld $15, %xmm0 +; X86-SSE2-NEXT: psrad $15, %xmm0 +; X86-SSE2-NEXT: movd %ecx, %xmm1 +; X86-SSE2-NEXT: pslld $13, %xmm1 +; X86-SSE2-NEXT: psrad $15, %xmm1 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE2-NEXT: shldl $15, %ecx, %eax -; X86-SSE2-NEXT: shll $15, %ecx -; X86-SSE2-NEXT: sarl $15, %ecx -; X86-SSE2-NEXT: movd %ecx, %xmm0 -; X86-SSE2-NEXT: shll $15, %eax -; X86-SSE2-NEXT: sarl $15, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: pslld $15, %xmm0 +; X86-SSE2-NEXT: psrad $15, %xmm0 +; X86-SSE2-NEXT: shldl $15, %eax, %ecx +; X86-SSE2-NEXT: movd %ecx, %xmm2 +; X86-SSE2-NEXT: pslld $15, %xmm2 +; X86-SSE2-NEXT: psrad $15, %xmm2 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X86-SSE2-NEXT: retl @@ -3748,9 +3743,9 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; X86-SSE41-NEXT: shldl $15, %eax, %ecx ; X86-SSE41-NEXT: shll $15, %ecx ; X86-SSE41-NEXT: sarl $15, %ecx -; X86-SSE41-NEXT: shll $15, %eax -; X86-SSE41-NEXT: sarl $15, %eax ; X86-SSE41-NEXT: movd %eax, %xmm0 +; X86-SSE41-NEXT: pslld $15, %xmm0 +; X86-SSE41-NEXT: psrad $15, %xmm0 ; X86-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 ; X86-SSE41-NEXT: shll $13, %esi ; X86-SSE41-NEXT: sarl $15, %esi diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 923af983f1d47..04262b4249256 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3537,47 +3537,43 @@ define <16 x i8> @PR107289(<16 x i8> %0) { ; SSE2-LABEL: PR107289: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm1, %rcx ; SSE2-NEXT: shldq $8, %rax, %rcx -; SSE2-NEXT: shlq $8, %rax ; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: psllq $8, %xmm0 ; SSE2-NEXT: 
punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: PR107289: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSSE3-NEXT: movq %xmm1, %rcx ; SSSE3-NEXT: shldq $8, %rax, %rcx -; SSSE3-NEXT: shlq $8, %rax ; SSSE3-NEXT: movq %rcx, %xmm1 +; SSSE3-NEXT: psllq $8, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: PR107289: ; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: shldq $8, %rcx, %rax -; SSE41-NEXT: shlq $8, %rcx -; SSE41-NEXT: movq %rax, %xmm1 -; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: pextrq $1, %xmm0, %rcx +; SSE41-NEXT: shldq $8, %rax, %rcx +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: psllq $8, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; ; AVX-LABEL: PR107289: ; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: shldq $8, %rcx, %rax -; AVX-NEXT: shlq $8, %rcx -; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vpextrq $1, %xmm0, %rcx +; AVX-NEXT: shldq $8, %rax, %rcx ; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vpsllq $8, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %src = bitcast <16 x i8> %0 to i128 %shl = shl i128 %src, 8 From 0c8d6df362fe5b4bce54776e2199623d0382293b Mon Sep 17 00:00:00 2001 From: erichkeane Date: Thu, 5 Sep 2024 08:18:41 -0700 Subject: [PATCH 240/425] Fix handling of FP-classify where the last arg fails to convert The last argument of an FP-classify function was checked for validity as an expression, but we never ensured that the usual unary conversions/etc properly resulted in a valid value. Thus, when we got the value, it was null, so we had a null dereference. This patch instead fails out/marks the function call as invalid if the argument is incorrect. I DID consider just allowing it to continue, but the result was an extraneous error about how the last argument wasn't a float (in this case, it was an overload set). Fixes: #107411 --- clang/lib/Sema/SemaChecking.cpp | 17 +++++++++++++---- clang/test/Sema/builtin-unary-fp.c | 4 ++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 2aab52160afa7..99500daca295c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -4936,10 +4936,19 @@ bool Sema::BuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs, // Usual Unary Conversions will convert half to float, which we want for // machines that use fp16 conversion intrinsics. Else, we wnat to leave the // type how it is, but do normal L->Rvalue conversions.
- if (Context.getTargetInfo().useFP16ConversionIntrinsics()) - OrigArg = UsualUnaryConversions(OrigArg).get(); - else - OrigArg = DefaultFunctionArrayLvalueConversion(OrigArg).get(); + if (Context.getTargetInfo().useFP16ConversionIntrinsics()) { + ExprResult Res = UsualUnaryConversions(OrigArg); + + if (!Res.isUsable()) + return true; + OrigArg = Res.get(); + } else { + ExprResult Res = DefaultFunctionArrayLvalueConversion(OrigArg); + + if (!Res.isUsable()) + return true; + OrigArg = Res.get(); + } TheCall->setArg(FPArgNo, OrigArg); QualType VectorResultTy; diff --git a/clang/test/Sema/builtin-unary-fp.c b/clang/test/Sema/builtin-unary-fp.c index 3f4f65eeb73a1..fb8e341156a59 100644 --- a/clang/test/Sema/builtin-unary-fp.c +++ b/clang/test/Sema/builtin-unary-fp.c @@ -14,4 +14,8 @@ void a(void) { check(__builtin_fpclassify(0, 1, 2, 3, 4.5, 5.0)); // expected-warning{{implicit conversion from 'double' to 'int' changes value from 4.5 to 4}} check(__builtin_fpclassify(0, 0, 0, 0, 1)); // expected-error{{too few arguments}} check(__builtin_fpclassify(0, 0, 0, 0, 0, 1, 0)); // expected-error{{too many arguments}} + + check(__builtin_fpclassify(0,0,0,0,0, (invalid))); // expected-error{{use of undeclared identifier 'invalid'}} + check(__builtin_fpclassify(0,0,0,0,0, (inf))); // expected-error{{use of undeclared identifier 'inf'}} + // expected-error@-1{{reference to overloaded function could not be resolved}} } From aea3b0f6838bd8268fc3653e1b662d771c87ab15 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 08:35:18 -0700 Subject: [PATCH 241/425] [ARM] Avoid repeated hash lookups (NFC) (#107356) --- llvm/lib/Target/ARM/ARMMachineFunctionInfo.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index b9ff3a08f998f..54bf5fffd3942 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -254,13 +254,9 @@ class ARMFunctionInfo : public MachineFunctionInfo { return -1U; } - DenseMap::iterator getCoalescedWeight( - MachineBasicBlock* MBB) { - auto It = CoalescedWeights.find(MBB); - if (It == CoalescedWeights.end()) { - It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first; - } - return It; + DenseMap::iterator + getCoalescedWeight(MachineBasicBlock *MBB) { + return CoalescedWeights.try_emplace(MBB, 0).first; } /// Indicate to the backend that \c GV has had its storage changed to inside From abfb340b779f2b20009fe42ebc522417adf79c44 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 08:35:46 -0700 Subject: [PATCH 242/425] [Analysis] Avoid repeated hash lookups (NFC) (#107357) --- clang/lib/Analysis/ThreadSafety.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp index c4a83b069e079..5577f45aa5217 100644 --- a/clang/lib/Analysis/ThreadSafety.cpp +++ b/clang/lib/Analysis/ThreadSafety.cpp @@ -1180,8 +1180,7 @@ void BeforeSet::checkBeforeAfter(const ValueDecl* StartVd, } // Transitively search other before sets, and warn on cycles. 
if (traverse(Vdb)) { - if (!CycMap.contains(Vd)) { - CycMap.insert(std::make_pair(Vd, true)); + if (CycMap.try_emplace(Vd, true).second) { StringRef L1 = Vd->getName(); Analyzer.Handler.handleBeforeAfterCycle(L1, Vd->getLocation()); } From 0593b95ff4c459ccf71f9472c148967d40f6d865 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 08:36:09 -0700 Subject: [PATCH 243/425] [CGOpenMPRuntime] Avoid repeated hash lookups (NFC) (#107358) --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 3d392d869ee39..9cf597a65be04 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11674,9 +11674,7 @@ CGOpenMPRuntime::LastprivateConditionalRAII::~LastprivateConditionalRAII() { Address CGOpenMPRuntime::emitLastprivateConditionalInit(CodeGenFunction &CGF, const VarDecl *VD) { ASTContext &C = CGM.getContext(); - auto I = LastprivateConditionalToTypes.find(CGF.CurFn); - if (I == LastprivateConditionalToTypes.end()) - I = LastprivateConditionalToTypes.try_emplace(CGF.CurFn).first; + auto I = LastprivateConditionalToTypes.try_emplace(CGF.CurFn).first; QualType NewType; const FieldDecl *VDField; const FieldDecl *FiredField; From be427dfb9ea6689947253d737708dc3645e179dc Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 5 Sep 2024 16:38:08 +0100 Subject: [PATCH 244/425] [Clang][Parser] Accept P2741R3 (static_assert with user-generated message) in C++11 as an extension (#102044) Added a new `-Wpre-c++26-compat` warning for when this feature is used in C++26 and a `-Wc++26-extensions` warning for when this is used in C++11 through C++23. --------- Co-authored-by: cor3ntin --- clang/docs/LanguageExtensions.rst | 2 ++ clang/docs/ReleaseNotes.rst | 3 ++ .../clang/Basic/DiagnosticParseKinds.td | 6 ++++ clang/lib/Frontend/InitPreprocessor.cpp | 7 ++--- clang/lib/Parse/ParseDeclCXX.cpp | 10 +++++-- clang/test/CXX/drs/cwg27xx.cpp | 2 +- clang/test/Lexer/cxx-features.cpp | 2 +- .../Parser/cxx11-user-defined-literals.cpp | 3 +- clang/test/Sema/static-assert.c | 8 ++++-- clang/test/SemaCXX/static-assert-ext.cpp | 28 +++++++++++++++++++ 10 files changed, 59 insertions(+), 12 deletions(-) create mode 100644 clang/test/SemaCXX/static-assert-ext.cpp diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 62903fc3744ca..c08697282cbfe 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -1483,6 +1483,7 @@ Generic lambda expressions __cpp_generic_lambdas C+ variable templates __cpp_variable_templates C++14 C++03 Binary literals __cpp_binary_literals C++14 C++03 Relaxed constexpr __cpp_constexpr C++14 C++11 +Static assert with no message __cpp_static_assert >= 201411L C++17 C++11 Pack expansion in generalized lambda-capture __cpp_init_captures C++17 C++03 ``if constexpr`` __cpp_if_constexpr C++17 C++11 fold expressions __cpp_fold_expressions C++17 C++03 @@ -1503,6 +1504,7 @@ Conditional ``explicit`` __cpp_conditional_explicit C+ ``static operator()`` __cpp_static_call_operator C++23 C++03 Attributes on Lambda-Expressions C++23 C++11 Attributes on Structured Bindings __cpp_structured_bindings C++26 C++03 +Static assert with user-generated message __cpp_static_assert >= 202306L C++26 C++11 Pack Indexing __cpp_pack_indexing C++26 C++03 ``= delete ("should have a reason");`` __cpp_deleted_function C++26 C++03 Variadic Friends __cpp_variadic_friend C++26 C++03 diff --git 
a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ab3c3e6049f60..ebd0b7371e1be 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -109,6 +109,9 @@ C++ Language Changes constant expression. Supports the `V.xyzw` syntax and other tidbits as seen in OpenCL. Selecting multiple elements is left as a future work. +- Accept C++26 user-defined ``static_assert`` messages in C++11 as an extension. + + C++2c Feature Support ^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 0b8ab4bf09250..0aa2c4a70849a 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -470,6 +470,12 @@ def warn_c17_compat_static_assert_no_message : Warning< "'_Static_assert' with no message is incompatible with C standards before " "C23">, DefaultIgnore, InGroup; +def ext_cxx_static_assert_user_generated_message : ExtWarn< + "'static_assert' with a user-generated message is a C++26 extension">, + InGroup; +def warn_cxx20_compat_static_assert_user_generated_message : Warning< + "'static_assert' with a user-generated message is incompatible with " + "C++ standards before C++26">, DefaultIgnore, InGroup; def err_function_definition_not_allowed : Error< "function definition is not allowed here">; def err_expected_end_of_enumerator : Error< diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 61260a3379828..9a0fdb175ff29 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -671,10 +671,9 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts, LangOpts.CPlusPlus23 ? "202211L" : LangOpts.CPlusPlus17 ? "201603L" : "200907"); - Builder.defineMacro("__cpp_static_assert", LangOpts.CPlusPlus26 ? "202306L" - : LangOpts.CPlusPlus17 - ? "201411L" - : "200410"); + // C++17 / C++26 static_assert supported as an extension in earlier language + // modes, so we use the C++26 value. + Builder.defineMacro("__cpp_static_assert", "202306L"); Builder.defineMacro("__cpp_decltype", "200707L"); Builder.defineMacro("__cpp_attributes", "200809L"); Builder.defineMacro("__cpp_rvalue_references", "200610L"); diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 7ca27d00c0bcb..6370da1fab004 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -1076,7 +1076,7 @@ Decl *Parser::ParseStaticAssertDeclaration(SourceLocation &DeclEnd) { } bool ParseAsExpression = false; - if (getLangOpts().CPlusPlus26) { + if (getLangOpts().CPlusPlus11) { for (unsigned I = 0;; ++I) { const Token &T = GetLookAheadToken(I); if (T.is(tok::r_paren)) @@ -1088,9 +1088,13 @@ Decl *Parser::ParseStaticAssertDeclaration(SourceLocation &DeclEnd) { } } - if (ParseAsExpression) + if (ParseAsExpression) { + Diag(Tok, + getLangOpts().CPlusPlus26 + ? 
diag::warn_cxx20_compat_static_assert_user_generated_message + : diag::ext_cxx_static_assert_user_generated_message); AssertMessage = ParseConstantExpressionInExprEvalContext(); - else if (tokenIsLikeStringLiteral(Tok, getLangOpts())) + } else if (tokenIsLikeStringLiteral(Tok, getLangOpts())) AssertMessage = ParseUnevaluatedStringLiteralExpression(); else { Diag(Tok, diag::err_expected_string_literal) diff --git a/clang/test/CXX/drs/cwg27xx.cpp b/clang/test/CXX/drs/cwg27xx.cpp index b3867696c615b..2b57dbc60aed7 100644 --- a/clang/test/CXX/drs/cwg27xx.cpp +++ b/clang/test/CXX/drs/cwg27xx.cpp @@ -178,7 +178,7 @@ void test() { } namespace cwg2798 { // cwg2798: 17 -#if __cpp_static_assert >= 202306 +#if __cplusplus > 202302L struct string { constexpr string() { data_ = new char[6](); diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp index 4a06d29ae9dbc..5b88e00b71508 100644 --- a/clang/test/Lexer/cxx-features.cpp +++ b/clang/test/Lexer/cxx-features.cpp @@ -325,7 +325,7 @@ #error "wrong value for __cpp_range_based_for" #endif -#if check(static_assert, 0, 200410, 200410, 201411, 201411, 201411, 202306) +#if check(static_assert, 0, 202306, 202306, 202306, 202306, 202306, 202306) #error "wrong value for __cpp_static_assert" #endif diff --git a/clang/test/Parser/cxx11-user-defined-literals.cpp b/clang/test/Parser/cxx11-user-defined-literals.cpp index 1a7e780588229..cdd06729efc39 100644 --- a/clang/test/Parser/cxx11-user-defined-literals.cpp +++ b/clang/test/Parser/cxx11-user-defined-literals.cpp @@ -21,7 +21,8 @@ int f() { asm("mov %eax, %rdx"_foo); // expected-error {{user-defined suffix cannot be used here}} } -static_assert(true, "foo"_bar); // expected-error {{user-defined suffix cannot be used here}} +static_assert(true, "foo"_bar); // expected-error {{no matching literal operator for call to 'operator""_bar'}} +// expected-warning@-1 {{'static_assert' with a user-generated message is a C++26 extension}} int cake() __attribute__((availability(macosx, unavailable, message = "is a lie"_x))); // expected-error {{user-defined suffix cannot be used here}} diff --git a/clang/test/Sema/static-assert.c b/clang/test/Sema/static-assert.c index 4e9e6b7ee558b..d603bc19bb824 100644 --- a/clang/test/Sema/static-assert.c +++ b/clang/test/Sema/static-assert.c @@ -25,8 +25,12 @@ void foo(void) { #endif } -_Static_assert(1, invalid); // expected-error {{expected string literal for diagnostic message in static_assert}} \ - // ext-warning {{'_Static_assert' is a C11 extension}} +_Static_assert(1, invalid); // ext-warning {{'_Static_assert' is a C11 extension}} +#ifndef __cplusplus +// expected-error@-2 {{expected string literal for diagnostic message in static_assert}} +#endif +// cxx-error@-4 {{use of undeclared identifier 'invalid'}} +// cxx-warning@-5 {{'static_assert' with a user-generated message is a C++26 extension}} struct A { int a; diff --git a/clang/test/SemaCXX/static-assert-ext.cpp b/clang/test/SemaCXX/static-assert-ext.cpp new file mode 100644 index 0000000000000..05f7a0e96974a --- /dev/null +++ b/clang/test/SemaCXX/static-assert-ext.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -std=c++98 -fsyntax-only -pedantic %s -verify=precxx11,precxx17,precxx26 +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -pedantic %s -verify=since-cxx11,precxx17,precxx26 -Wc++98-compat +// RUN: %clang_cc1 -std=c++17 -fsyntax-only -pedantic %s -verify=since-cxx11,since-cxx17,precxx26 -Wc++98-compat -Wpre-c++17-compat +// RUN: %clang_cc1 -std=c++26 -fsyntax-only -pedantic %s 
-verify=since-cxx11,since-cxx17,since-cxx26 -Wc++98-compat -Wpre-c++17-compat -Wpre-c++26-compat + +static_assert(false, "a"); +// precxx11-error@-1 {{a type specifier is required for all declarations}} +// since-cxx11-warning@-2 {{'static_assert' declarations are incompatible with C++98}} +// since-cxx11-error@-3 {{static assertion failed: a}} + +#if __cplusplus >= 201103L +static_assert(false); +// since-cxx11-warning@-1 {{'static_assert' declarations are incompatible with C++98}} +// precxx17-warning@-2 {{'static_assert' with no message is a C++17 extension}} +// since-cxx17-warning@-3 {{'static_assert' with no message is incompatible with C++ standards before C++17}} +// since-cxx11-error@-4 {{static assertion failed}} + +struct X { + static constexpr int size() { return 1; } // since-cxx11-warning {{'constexpr'}} + static constexpr const char* data() { return "b"; } // since-cxx11-warning {{'constexpr'}} +}; + +static_assert(false, X()); +// since-cxx11-warning@-1 {{'static_assert' declarations are incompatible with C++98}} +// precxx26-warning@-2 {{'static_assert' with a user-generated message is a C++26 extension}} +// since-cxx26-warning@-3 {{'static_assert' with a user-generated message is incompatible with C++ standards before C++26}} +// since-cxx11-error@-4 {{static assertion failed: b}} +#endif From 13013bdc6a5e4def05204fb69d7a31ef17ccd1c7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 5 Sep 2024 08:42:13 -0700 Subject: [PATCH 245/425] [RISCV] Don't cost Fmv for Zfinx in isFPImmLegal. (#107361) There is no Fmv with Zfinx. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 +- llvm/test/CodeGen/RISCV/double-convert.ll | 36 +-- llvm/test/CodeGen/RISCV/double-imm.ll | 4 +- llvm/test/CodeGen/RISCV/double-intrinsics.ll | 24 +- llvm/test/CodeGen/RISCV/double-round-conv.ll | 20 +- llvm/test/CodeGen/RISCV/float-convert.ll | 32 +- .../CodeGen/RISCV/float-round-conv-sat.ll | 48 +-- llvm/test/CodeGen/RISCV/half-arith.ll | 41 +-- llvm/test/CodeGen/RISCV/half-convert.ll | 232 +++++++------- llvm/test/CodeGen/RISCV/half-imm.ll | 8 +- llvm/test/CodeGen/RISCV/half-intrinsics.ll | 24 +- .../test/CodeGen/RISCV/half-round-conv-sat.ll | 264 +++++++-------- llvm/test/CodeGen/RISCV/half-round-conv.ll | 300 +++++++++--------- 13 files changed, 512 insertions(+), 528 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index d400b2ea1ca2c..6b4219b462384 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2288,10 +2288,11 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, return true; // Building an integer and then converting requires a fmv at the end of - // the integer sequence. + // the integer sequence. The fmv is not required for Zfinx. + const int FmvCost = Subtarget.hasStdExtZfinx() ? 
0 : 1; const int Cost = - 1 + RISCVMatInt::getIntMatCost(Imm.bitcastToAPInt(), Subtarget.getXLen(), - Subtarget); + FmvCost + RISCVMatInt::getIntMatCost(Imm.bitcastToAPInt(), + Subtarget.getXLen(), Subtarget); return Cost <= FPImmCost; } diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll index 2e2e1b924cf00..ef2d8e7627be5 100644 --- a/llvm/test/CodeGen/RISCV/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/double-convert.ll @@ -1668,16 +1668,16 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i16: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI26_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI26_0)(a1) -; RV64IZFINXZDINX-NEXT: lui a2, %hi(.LCPI26_1) -; RV64IZFINXZDINX-NEXT: ld a2, %lo(.LCPI26_1)(a2) -; RV64IZFINXZDINX-NEXT: feq.d a3, a0, a0 -; RV64IZFINXZDINX-NEXT: neg a3, a3 -; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: lui a2, %hi(.LCPI26_0) +; RV64IZFINXZDINX-NEXT: ld a2, %lo(.LCPI26_0)(a2) +; RV64IZFINXZDINX-NEXT: li a3, -505 +; RV64IZFINXZDINX-NEXT: slli a3, a3, 53 +; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a3 ; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a2 ; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rtz -; RV64IZFINXZDINX-NEXT: and a0, a3, a0 +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat_i16: @@ -2043,16 +2043,16 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i8: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI30_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI30_0)(a1) -; RV64IZFINXZDINX-NEXT: lui a2, %hi(.LCPI30_1) -; RV64IZFINXZDINX-NEXT: ld a2, %lo(.LCPI30_1)(a2) -; RV64IZFINXZDINX-NEXT: feq.d a3, a0, a0 -; RV64IZFINXZDINX-NEXT: neg a3, a3 -; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: li a2, -509 +; RV64IZFINXZDINX-NEXT: slli a2, a2, 53 +; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a2 +; RV64IZFINXZDINX-NEXT: lui a2, 65919 +; RV64IZFINXZDINX-NEXT: slli a2, a2, 34 ; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a2 ; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rtz -; RV64IZFINXZDINX-NEXT: and a0, a3, a0 +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat_i8: @@ -2234,9 +2234,9 @@ define zeroext i8 @fcvt_wu_s_sat_i8(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: fcvt_wu_s_sat_i8: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI32_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI32_0)(a1) ; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, zero +; RV64IZFINXZDINX-NEXT: lui a1, 131967 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 33 ; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a1 ; RV64IZFINXZDINX-NEXT: fcvt.lu.d a0, a0, rtz ; RV64IZFINXZDINX-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/double-imm.ll b/llvm/test/CodeGen/RISCV/double-imm.ll index 74d4acc4f23f8..827f034f143fb 100644 --- a/llvm/test/CodeGen/RISCV/double-imm.ll +++ b/llvm/test/CodeGen/RISCV/double-imm.ll @@ -62,8 +62,8 @@ define double @double_imm_op(double %a) nounwind { ; ; CHECKRV64ZDINX-LABEL: double_imm_op: ; CHECKRV64ZDINX: # %bb.0: -; CHECKRV64ZDINX-NEXT: lui a1, %hi(.LCPI1_0) -; CHECKRV64ZDINX-NEXT: ld a1, %lo(.LCPI1_0)(a1) +; CHECKRV64ZDINX-NEXT: li a1, 1023 +; CHECKRV64ZDINX-NEXT: slli a1, a1, 52 ; CHECKRV64ZDINX-NEXT: fadd.d a0, a0, a1 ; 
CHECKRV64ZDINX-NEXT: ret %1 = fadd double %a, 1.0 diff --git a/llvm/test/CodeGen/RISCV/double-intrinsics.ll b/llvm/test/CodeGen/RISCV/double-intrinsics.ll index eef48d1eafbfe..94b3b1f1b199c 100644 --- a/llvm/test/CodeGen/RISCV/double-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/double-intrinsics.ll @@ -869,8 +869,8 @@ define double @floor_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: floor_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI17_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI17_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB17_2 @@ -934,8 +934,8 @@ define double @ceil_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: ceil_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI18_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI18_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB18_2 @@ -999,8 +999,8 @@ define double @trunc_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: trunc_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI19_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI19_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB19_2 @@ -1064,8 +1064,8 @@ define double @rint_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: rint_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI20_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI20_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB20_2 @@ -1170,8 +1170,8 @@ define double @round_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: round_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI22_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI22_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB22_2 @@ -1235,8 +1235,8 @@ define double @roundeven_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: roundeven_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI23_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI23_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB23_2 diff --git a/llvm/test/CodeGen/RISCV/double-round-conv.ll b/llvm/test/CodeGen/RISCV/double-round-conv.ll index d84d80a4a10e9..12f025c65f36a 100644 --- a/llvm/test/CodeGen/RISCV/double-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/double-round-conv.ll @@ -1130,8 +1130,8 @@ define double @test_floor_double(double %x) { ; ; RV64IZFINXZDINX-LABEL: test_floor_double: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI40_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI40_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; 
RV64IZFINXZDINX-NEXT: beqz a1, .LBB40_2 @@ -1177,8 +1177,8 @@ define double @test_ceil_double(double %x) { ; ; RV64IZFINXZDINX-LABEL: test_ceil_double: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI41_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI41_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB41_2 @@ -1224,8 +1224,8 @@ define double @test_trunc_double(double %x) { ; ; RV64IZFINXZDINX-LABEL: test_trunc_double: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI42_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI42_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB42_2 @@ -1271,8 +1271,8 @@ define double @test_round_double(double %x) { ; ; RV64IZFINXZDINX-LABEL: test_round_double: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI43_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI43_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB43_2 @@ -1318,8 +1318,8 @@ define double @test_roundeven_double(double %x) { ; ; RV64IZFINXZDINX-LABEL: test_roundeven_double: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI44_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI44_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB44_2 diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll index 805ddee4ac3f6..031976b4fa2b2 100644 --- a/llvm/test/CodeGen/RISCV/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/float-convert.ll @@ -682,8 +682,8 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32IZFINX-NEXT: # %bb.1: # %start ; RV32IZFINX-NEXT: mv a2, a1 ; RV32IZFINX-NEXT: .LBB12_2: # %start -; RV32IZFINX-NEXT: lui a1, %hi(.LCPI12_0) -; RV32IZFINX-NEXT: lw a1, %lo(.LCPI12_0)(a1) +; RV32IZFINX-NEXT: lui a1, 389120 +; RV32IZFINX-NEXT: addi a1, a1, -1 ; RV32IZFINX-NEXT: flt.s a3, a1, s0 ; RV32IZFINX-NEXT: beqz a3, .LBB12_4 ; RV32IZFINX-NEXT: # %bb.3: @@ -910,9 +910,9 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI14_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI14_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1445,11 +1445,11 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV32IZFINX-LABEL: fcvt_w_s_sat_i16: ; RV32IZFINX: # %bb.0: # %start ; RV32IZFINX-NEXT: feq.s a1, a0, a0 -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI24_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI24_0)(a2) ; RV32IZFINX-NEXT: neg a1, a1 -; RV32IZFINX-NEXT: lui a3, 815104 -; RV32IZFINX-NEXT: fmax.s a0, a0, a3 +; RV32IZFINX-NEXT: lui a2, 815104 +; RV32IZFINX-NEXT: fmax.s a0, a0, a2 +; RV32IZFINX-NEXT: lui a2, 290816 +; RV32IZFINX-NEXT: addi a2, a2, -512 ; RV32IZFINX-NEXT: fmin.s a0, a0, a2 ; RV32IZFINX-NEXT: fcvt.w.s a0, a0, rtz ; RV32IZFINX-NEXT: and 
a0, a1, a0 @@ -1458,11 +1458,11 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV64IZFINX-LABEL: fcvt_w_s_sat_i16: ; RV64IZFINX: # %bb.0: # %start ; RV64IZFINX-NEXT: feq.s a1, a0, a0 -; RV64IZFINX-NEXT: lui a2, %hi(.LCPI24_0) -; RV64IZFINX-NEXT: lw a2, %lo(.LCPI24_0)(a2) ; RV64IZFINX-NEXT: neg a1, a1 -; RV64IZFINX-NEXT: lui a3, 815104 -; RV64IZFINX-NEXT: fmax.s a0, a0, a3 +; RV64IZFINX-NEXT: lui a2, 815104 +; RV64IZFINX-NEXT: fmax.s a0, a0, a2 +; RV64IZFINX-NEXT: lui a2, 290816 +; RV64IZFINX-NEXT: addiw a2, a2, -512 ; RV64IZFINX-NEXT: fmin.s a0, a0, a2 ; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rtz ; RV64IZFINX-NEXT: and a0, a1, a0 @@ -1622,18 +1622,18 @@ define zeroext i16 @fcvt_wu_s_sat_i16(float %a) nounwind { ; ; RV32IZFINX-LABEL: fcvt_wu_s_sat_i16: ; RV32IZFINX: # %bb.0: # %start -; RV32IZFINX-NEXT: lui a1, %hi(.LCPI26_0) -; RV32IZFINX-NEXT: lw a1, %lo(.LCPI26_0)(a1) ; RV32IZFINX-NEXT: fmax.s a0, a0, zero +; RV32IZFINX-NEXT: lui a1, 292864 +; RV32IZFINX-NEXT: addi a1, a1, -256 ; RV32IZFINX-NEXT: fmin.s a0, a0, a1 ; RV32IZFINX-NEXT: fcvt.wu.s a0, a0, rtz ; RV32IZFINX-NEXT: ret ; ; RV64IZFINX-LABEL: fcvt_wu_s_sat_i16: ; RV64IZFINX: # %bb.0: # %start -; RV64IZFINX-NEXT: lui a1, %hi(.LCPI26_0) -; RV64IZFINX-NEXT: lw a1, %lo(.LCPI26_0)(a1) ; RV64IZFINX-NEXT: fmax.s a0, a0, zero +; RV64IZFINX-NEXT: lui a1, 292864 +; RV64IZFINX-NEXT: addiw a1, a1, -256 ; RV64IZFINX-NEXT: fmin.s a0, a0, a1 ; RV64IZFINX-NEXT: fcvt.lu.s a0, a0, rtz ; RV64IZFINX-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll index 5e99c7eb90562..42ac20286a892 100644 --- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll @@ -112,9 +112,9 @@ define i64 @test_floor_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -241,9 +241,9 @@ define i64 @test_floor_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI3_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI3_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -372,9 +372,9 @@ define i64 @test_ceil_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI5_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI5_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -501,9 +501,9 @@ define i64 @test_ceil_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI7_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI7_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, 
a2, a0 @@ -632,9 +632,9 @@ define i64 @test_trunc_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI9_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI9_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -761,9 +761,9 @@ define i64 @test_trunc_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI11_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI11_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -892,9 +892,9 @@ define i64 @test_round_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI13_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI13_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1021,9 +1021,9 @@ define i64 @test_round_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI15_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI15_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1152,9 +1152,9 @@ define i64 @test_roundeven_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI17_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI17_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1281,9 +1281,9 @@ define i64 @test_roundeven_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI19_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI19_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1412,9 +1412,9 @@ define i64 @test_rint_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI21_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI21_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1541,9 +1541,9 @@ define i64 @test_rint_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI23_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI23_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; 
RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index f8522b09970bf..b033c75eeadd8 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -2910,35 +2910,18 @@ define half @fsgnjx_f16(half %x, half %y) nounwind { ; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; RV64IZFHMIN-NEXT: ret ; -; RV32IZHINXMIN-LABEL: fsgnjx_f16: -; RV32IZHINXMIN: # %bb.0: -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI23_0) -; RV32IZHINXMIN-NEXT: lh a2, %lo(.LCPI23_0)(a2) -; RV32IZHINXMIN-NEXT: lui a3, 1048568 -; RV32IZHINXMIN-NEXT: and a0, a0, a3 -; RV32IZHINXMIN-NEXT: slli a2, a2, 17 -; RV32IZHINXMIN-NEXT: srli a2, a2, 17 -; RV32IZHINXMIN-NEXT: or a0, a2, a0 -; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV32IZHINXMIN-NEXT: fmul.s a0, a0, a1 -; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 -; RV32IZHINXMIN-NEXT: ret -; -; RV64IZHINXMIN-LABEL: fsgnjx_f16: -; RV64IZHINXMIN: # %bb.0: -; RV64IZHINXMIN-NEXT: lui a2, %hi(.LCPI23_0) -; RV64IZHINXMIN-NEXT: lh a2, %lo(.LCPI23_0)(a2) -; RV64IZHINXMIN-NEXT: lui a3, 1048568 -; RV64IZHINXMIN-NEXT: and a0, a0, a3 -; RV64IZHINXMIN-NEXT: slli a2, a2, 49 -; RV64IZHINXMIN-NEXT: srli a2, a2, 49 -; RV64IZHINXMIN-NEXT: or a0, a2, a0 -; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV64IZHINXMIN-NEXT: fmul.s a0, a0, a1 -; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 -; RV64IZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fsgnjx_f16: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: lui a2, 1048568 +; CHECKIZHINXMIN-NEXT: and a0, a0, a2 +; CHECKIZHINXMIN-NEXT: li a2, 15 +; CHECKIZHINXMIN-NEXT: slli a2, a2, 10 +; CHECKIZHINXMIN-NEXT: or a0, a0, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %z = call half @llvm.copysign.f16(half 1.0, half %x) %mul = fmul half %z, %y ret half %mul diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll index bc1a652061020..32f7dfaee8837 100644 --- a/llvm/test/CodeGen/RISCV/half-convert.ll +++ b/llvm/test/CodeGen/RISCV/half-convert.ll @@ -255,11 +255,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IZHINX: # %bb.0: # %start ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV32IZHINX-NEXT: neg a1, a1 -; RV32IZHINX-NEXT: lui a3, 815104 -; RV32IZHINX-NEXT: fmax.s a0, a0, a3 +; RV32IZHINX-NEXT: lui a2, 815104 +; RV32IZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZHINX-NEXT: lui a2, 290816 +; RV32IZHINX-NEXT: addi a2, a2, -512 ; RV32IZHINX-NEXT: fmin.s a0, a0, a2 ; RV32IZHINX-NEXT: fcvt.w.s a0, a0, rtz ; RV32IZHINX-NEXT: and a0, a1, a0 @@ -269,11 +269,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64IZHINX: # %bb.0: # %start ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZHINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV64IZHINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV64IZHINX-NEXT: neg a1, a1 -; RV64IZHINX-NEXT: lui a3, 815104 -; RV64IZHINX-NEXT: fmax.s a0, a0, a3 +; RV64IZHINX-NEXT: lui a2, 815104 +; RV64IZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZHINX-NEXT: lui a2, 290816 +; RV64IZHINX-NEXT: addiw a2, a2, -512 ; RV64IZHINX-NEXT: fmin.s a0, a0, a2 ; RV64IZHINX-NEXT: fcvt.l.s a0, a0, rtz ; RV64IZHINX-NEXT: 
and a0, a1, a0 @@ -283,11 +283,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IZDINXZHINX: # %bb.0: # %start ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZDINXZHINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZDINXZHINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV32IZDINXZHINX-NEXT: neg a1, a1 -; RV32IZDINXZHINX-NEXT: lui a3, 815104 -; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a3 +; RV32IZDINXZHINX-NEXT: lui a2, 815104 +; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZDINXZHINX-NEXT: lui a2, 290816 +; RV32IZDINXZHINX-NEXT: addi a2, a2, -512 ; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a2 ; RV32IZDINXZHINX-NEXT: fcvt.w.s a0, a0, rtz ; RV32IZDINXZHINX-NEXT: and a0, a1, a0 @@ -297,11 +297,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64IZDINXZHINX: # %bb.0: # %start ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZDINXZHINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV64IZDINXZHINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV64IZDINXZHINX-NEXT: neg a1, a1 -; RV64IZDINXZHINX-NEXT: lui a3, 815104 -; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a3 +; RV64IZDINXZHINX-NEXT: lui a2, 815104 +; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZDINXZHINX-NEXT: lui a2, 290816 +; RV64IZDINXZHINX-NEXT: addiw a2, a2, -512 ; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a2 ; RV64IZDINXZHINX-NEXT: fcvt.l.s a0, a0, rtz ; RV64IZDINXZHINX-NEXT: and a0, a1, a0 @@ -505,11 +505,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) -; CHECK32-IZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; CHECK32-IZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZHINXMIN-NEXT: lui a3, 815104 -; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK32-IZHINXMIN-NEXT: lui a2, 815104 +; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZHINXMIN-NEXT: addi a2, a2, -512 ; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz ; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 @@ -519,11 +519,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) -; CHECK64-IZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; CHECK64-IZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZHINXMIN-NEXT: lui a3, 815104 -; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK64-IZHINXMIN-NEXT: lui a2, 815104 +; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZHINXMIN-NEXT: addiw a2, a2, -512 ; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz ; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 @@ -533,11 +533,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a3, 815104 -; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 815104 +; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a2, a2, -512 ; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 ; 
CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz ; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 @@ -547,11 +547,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) -; CHECK64-IZDINXZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a3, 815104 -; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 815104 +; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZDINXZHINXMIN-NEXT: addiw a2, a2, -512 ; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz ; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 @@ -755,40 +755,40 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; ; RV32IZHINX-LABEL: fcvt_ui_h_sat: ; RV32IZHINX: # %bb.0: # %start -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV32IZHINX-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZHINX-NEXT: fmax.s a0, a0, zero +; RV32IZHINX-NEXT: lui a1, 292864 +; RV32IZHINX-NEXT: addi a1, a1, -256 ; RV32IZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZHINX-NEXT: fcvt.wu.s a0, a0, rtz ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: fcvt_ui_h_sat: ; RV64IZHINX: # %bb.0: # %start -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV64IZHINX-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZHINX-NEXT: fmax.s a0, a0, zero +; RV64IZHINX-NEXT: lui a1, 292864 +; RV64IZHINX-NEXT: addiw a1, a1, -256 ; RV64IZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZHINX-NEXT: fcvt.lu.s a0, a0, rtz ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_ui_h_sat: ; RV32IZDINXZHINX: # %bb.0: # %start -; RV32IZDINXZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV32IZDINXZHINX-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, zero +; RV32IZDINXZHINX-NEXT: lui a1, 292864 +; RV32IZDINXZHINX-NEXT: addi a1, a1, -256 ; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZDINXZHINX-NEXT: fcvt.wu.s a0, a0, rtz ; RV32IZDINXZHINX-NEXT: ret ; ; RV64IZDINXZHINX-LABEL: fcvt_ui_h_sat: ; RV64IZDINXZHINX: # %bb.0: # %start -; RV64IZDINXZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV64IZDINXZHINX-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, zero +; RV64IZDINXZHINX-NEXT: lui a1, 292864 +; RV64IZDINXZHINX-NEXT: addiw a1, a1, -256 ; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZDINXZHINX-NEXT: fcvt.lu.s a0, a0, rtz ; RV64IZDINXZHINX-NEXT: ret @@ -955,40 +955,40 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; ; CHECK32-IZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK32-IZHINXMIN: # %bb.0: # %start -; CHECK32-IZHINXMIN-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK32-IZHINXMIN-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK32-IZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK64-IZHINXMIN: # %bb.0: # %start -; CHECK64-IZHINXMIN-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK64-IZHINXMIN-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK64-IZHINXMIN-NEXT: lui a1, 292864 
+; CHECK64-IZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz ; CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start -; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start -; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK64-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK64-IZDINXZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz ; CHECK64-IZDINXZHINXMIN-NEXT: ret @@ -2248,10 +2248,10 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZHINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IZHINX-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32IZHINX-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IZHINX-NEXT: lw a1, %lo(.LCPI10_0)(a1) ; RV32IZHINX-NEXT: fcvt.s.h s0, a0 -; RV32IZHINX-NEXT: flt.s s1, a1, s0 +; RV32IZHINX-NEXT: lui a0, 389120 +; RV32IZHINX-NEXT: addi a0, a0, -1 +; RV32IZHINX-NEXT: flt.s s1, a0, s0 ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: lui a0, 913408 ; RV32IZHINX-NEXT: fle.s s3, a0, s0 @@ -2301,10 +2301,10 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZDINXZHINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IZDINXZHINX-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32IZDINXZHINX-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32IZDINXZHINX-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IZDINXZHINX-NEXT: lw a1, %lo(.LCPI10_0)(a1) ; RV32IZDINXZHINX-NEXT: fcvt.s.h s0, a0 -; RV32IZDINXZHINX-NEXT: flt.s s1, a1, s0 +; RV32IZDINXZHINX-NEXT: lui a0, 389120 +; RV32IZDINXZHINX-NEXT: addi a0, a0, -1 +; RV32IZDINXZHINX-NEXT: flt.s s1, a0, s0 ; RV32IZDINXZHINX-NEXT: neg s2, s1 ; RV32IZDINXZHINX-NEXT: lui a0, 913408 ; RV32IZDINXZHINX-NEXT: fle.s s3, a0, s0 @@ -2651,10 +2651,10 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; CHECK32-IZHINXMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; CHECK32-IZHINXMIN-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; CHECK32-IZHINXMIN-NEXT: lui a1, %hi(.LCPI10_0) -; CHECK32-IZHINXMIN-NEXT: lw a1, %lo(.LCPI10_0)(a1) ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h s0, a0 -; CHECK32-IZHINXMIN-NEXT: flt.s s1, a1, s0 +; CHECK32-IZHINXMIN-NEXT: lui a0, 389120 +; CHECK32-IZHINXMIN-NEXT: addi a0, a0, -1 +; CHECK32-IZHINXMIN-NEXT: flt.s s1, a0, s0 ; CHECK32-IZHINXMIN-NEXT: neg s2, s1 ; CHECK32-IZHINXMIN-NEXT: lui a0, 913408 ; CHECK32-IZHINXMIN-NEXT: fle.s s3, a0, s0 @@ -2705,10 +2705,10 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; CHECK32-IZDINXZHINXMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; CHECK32-IZDINXZHINXMIN-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI10_0) -; 
CHECK32-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI10_0)(a1) ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h s0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: flt.s s1, a1, s0 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a0, 389120 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a0, a0, -1 +; CHECK32-IZDINXZHINXMIN-NEXT: flt.s s1, a0, s0 ; CHECK32-IZDINXZHINXMIN-NEXT: neg s2, s1 ; CHECK32-IZDINXZHINXMIN-NEXT: lui a0, 913408 ; CHECK32-IZDINXZHINXMIN-NEXT: fle.s s3, a0, s0 @@ -3000,9 +3000,9 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IZHINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZHINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZHINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI12_0) -; RV32IZHINX-NEXT: lw a1, %lo(.LCPI12_0)(a1) ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 +; RV32IZHINX-NEXT: lui a1, 391168 +; RV32IZHINX-NEXT: addi a1, a1, -1 ; RV32IZHINX-NEXT: flt.s a1, a1, a0 ; RV32IZHINX-NEXT: neg s0, a1 ; RV32IZHINX-NEXT: fle.s a1, zero, a0 @@ -3033,9 +3033,9 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IZDINXZHINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZDINXZHINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZDINXZHINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32IZDINXZHINX-NEXT: lui a1, %hi(.LCPI12_0) -; RV32IZDINXZHINX-NEXT: lw a1, %lo(.LCPI12_0)(a1) ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 +; RV32IZDINXZHINX-NEXT: lui a1, 391168 +; RV32IZDINXZHINX-NEXT: addi a1, a1, -1 ; RV32IZDINXZHINX-NEXT: flt.s a1, a1, a0 ; RV32IZDINXZHINX-NEXT: neg s0, a1 ; RV32IZDINXZHINX-NEXT: fle.s a1, zero, a0 @@ -3245,9 +3245,9 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; CHECK32-IZHINXMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; CHECK32-IZHINXMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; CHECK32-IZHINXMIN-NEXT: lui a1, %hi(.LCPI12_0) -; CHECK32-IZHINXMIN-NEXT: lw a1, %lo(.LCPI12_0)(a1) ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECK32-IZHINXMIN-NEXT: lui a1, 391168 +; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -1 ; CHECK32-IZHINXMIN-NEXT: flt.s a1, a1, a0 ; CHECK32-IZHINXMIN-NEXT: neg s0, a1 ; CHECK32-IZHINXMIN-NEXT: fle.s a1, zero, a0 @@ -3279,9 +3279,9 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; CHECK32-IZDINXZHINXMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; CHECK32-IZDINXZHINXMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI12_0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI12_0)(a1) ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 391168 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -1 ; CHECK32-IZDINXZHINXMIN-NEXT: flt.s a1, a1, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: neg s0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: fle.s a1, zero, a0 @@ -6373,11 +6373,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IZHINX: # %bb.0: # %start ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI32_0) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; RV32IZHINX-NEXT: neg a1, a1 -; RV32IZHINX-NEXT: lui a3, 815104 -; RV32IZHINX-NEXT: fmax.s a0, a0, a3 +; RV32IZHINX-NEXT: lui a2, 815104 +; RV32IZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZHINX-NEXT: lui a2, 290816 +; RV32IZHINX-NEXT: addi a2, a2, -512 ; RV32IZHINX-NEXT: fmin.s a0, a0, a2 ; RV32IZHINX-NEXT: fcvt.w.s a0, a0, rtz ; RV32IZHINX-NEXT: and a0, a1, a0 @@ -6387,11 +6387,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64IZHINX: # %bb.0: # 
%start ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZHINX-NEXT: lui a2, %hi(.LCPI32_0) -; RV64IZHINX-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; RV64IZHINX-NEXT: neg a1, a1 -; RV64IZHINX-NEXT: lui a3, 815104 -; RV64IZHINX-NEXT: fmax.s a0, a0, a3 +; RV64IZHINX-NEXT: lui a2, 815104 +; RV64IZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZHINX-NEXT: lui a2, 290816 +; RV64IZHINX-NEXT: addiw a2, a2, -512 ; RV64IZHINX-NEXT: fmin.s a0, a0, a2 ; RV64IZHINX-NEXT: fcvt.l.s a0, a0, rtz ; RV64IZHINX-NEXT: and a0, a1, a0 @@ -6401,11 +6401,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IZDINXZHINX: # %bb.0: # %start ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZDINXZHINX-NEXT: lui a2, %hi(.LCPI32_0) -; RV32IZDINXZHINX-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; RV32IZDINXZHINX-NEXT: neg a1, a1 -; RV32IZDINXZHINX-NEXT: lui a3, 815104 -; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a3 +; RV32IZDINXZHINX-NEXT: lui a2, 815104 +; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZDINXZHINX-NEXT: lui a2, 290816 +; RV32IZDINXZHINX-NEXT: addi a2, a2, -512 ; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a2 ; RV32IZDINXZHINX-NEXT: fcvt.w.s a0, a0, rtz ; RV32IZDINXZHINX-NEXT: and a0, a1, a0 @@ -6415,11 +6415,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64IZDINXZHINX: # %bb.0: # %start ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZDINXZHINX-NEXT: lui a2, %hi(.LCPI32_0) -; RV64IZDINXZHINX-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; RV64IZDINXZHINX-NEXT: neg a1, a1 -; RV64IZDINXZHINX-NEXT: lui a3, 815104 -; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a3 +; RV64IZDINXZHINX-NEXT: lui a2, 815104 +; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZDINXZHINX-NEXT: lui a2, 290816 +; RV64IZDINXZHINX-NEXT: addiw a2, a2, -512 ; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a2 ; RV64IZDINXZHINX-NEXT: fcvt.l.s a0, a0, rtz ; RV64IZDINXZHINX-NEXT: and a0, a1, a0 @@ -6627,11 +6627,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZHINXMIN-NEXT: lui a2, %hi(.LCPI32_0) -; CHECK32-IZHINXMIN-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; CHECK32-IZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZHINXMIN-NEXT: lui a3, 815104 -; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK32-IZHINXMIN-NEXT: lui a2, 815104 +; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZHINXMIN-NEXT: addi a2, a2, -512 ; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz ; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 @@ -6641,11 +6641,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZHINXMIN-NEXT: lui a2, %hi(.LCPI32_0) -; CHECK64-IZHINXMIN-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; CHECK64-IZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZHINXMIN-NEXT: lui a3, 815104 -; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK64-IZHINXMIN-NEXT: lui a2, 815104 +; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZHINXMIN-NEXT: addiw a2, a2, -512 ; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz ; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 @@ -6655,11 +6655,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; 
CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, %hi(.LCPI32_0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a3, 815104 -; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 815104 +; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a2, a2, -512 ; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz ; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 @@ -6669,11 +6669,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, %hi(.LCPI32_0) -; CHECK64-IZDINXZHINXMIN-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a3, 815104 -; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 815104 +; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZDINXZHINXMIN-NEXT: addiw a2, a2, -512 ; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz ; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 @@ -6876,40 +6876,40 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; ; RV32IZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV32IZHINX: # %bb.0: # %start -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI34_0) -; RV32IZHINX-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZHINX-NEXT: fmax.s a0, a0, zero +; RV32IZHINX-NEXT: lui a1, 292864 +; RV32IZHINX-NEXT: addi a1, a1, -256 ; RV32IZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZHINX-NEXT: fcvt.wu.s a0, a0, rtz ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV64IZHINX: # %bb.0: # %start -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI34_0) -; RV64IZHINX-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZHINX-NEXT: fmax.s a0, a0, zero +; RV64IZHINX-NEXT: lui a1, 292864 +; RV64IZHINX-NEXT: addiw a1, a1, -256 ; RV64IZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZHINX-NEXT: fcvt.lu.s a0, a0, rtz ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV32IZDINXZHINX: # %bb.0: # %start -; RV32IZDINXZHINX-NEXT: lui a1, %hi(.LCPI34_0) -; RV32IZDINXZHINX-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, zero +; RV32IZDINXZHINX-NEXT: lui a1, 292864 +; RV32IZDINXZHINX-NEXT: addi a1, a1, -256 ; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZDINXZHINX-NEXT: fcvt.wu.s a0, a0, rtz ; RV32IZDINXZHINX-NEXT: ret ; ; RV64IZDINXZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV64IZDINXZHINX: # %bb.0: # %start -; RV64IZDINXZHINX-NEXT: lui a1, %hi(.LCPI34_0) -; RV64IZDINXZHINX-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, zero +; RV64IZDINXZHINX-NEXT: lui a1, 292864 +; RV64IZDINXZHINX-NEXT: addiw a1, a1, -256 ; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZDINXZHINX-NEXT: fcvt.lu.s a0, a0, rtz ; RV64IZDINXZHINX-NEXT: ret @@ -7082,40 +7082,40 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; ; CHECK32-IZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK32-IZHINXMIN: # %bb.0: # %start -; CHECK32-IZHINXMIN-NEXT: lui a1, %hi(.LCPI34_0) 
-; CHECK32-IZHINXMIN-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK32-IZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK64-IZHINXMIN: # %bb.0: # %start -; CHECK64-IZHINXMIN-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK64-IZHINXMIN-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK64-IZHINXMIN-NEXT: lui a1, 292864 +; CHECK64-IZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz ; CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start -; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start -; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK64-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK64-IZDINXZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz ; CHECK64-IZDINXZHINXMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/half-imm.ll b/llvm/test/CodeGen/RISCV/half-imm.ll index 4c39885176f01..2ebc28c2ebd44 100644 --- a/llvm/test/CodeGen/RISCV/half-imm.ll +++ b/llvm/test/CodeGen/RISCV/half-imm.ll @@ -70,15 +70,15 @@ define half @half_imm_op(half %a) nounwind { ; ; RV32IZHINX-LABEL: half_imm_op: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV32IZHINX-NEXT: li a1, 15 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fadd.h a0, a0, a1 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: half_imm_op: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV64IZHINX-NEXT: li a1, 15 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fadd.h a0, a0, a1 ; RV64IZHINX-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll index 40363b321848d..3e0f838270aa5 100644 --- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll @@ -2132,8 +2132,8 @@ define half @floor_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: floor_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI17_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI17_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB17_2 @@ -2223,8 +2223,8 @@ define half @ceil_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: ceil_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI18_0) -; 
CHECKIZHINX-NEXT: lh a1, %lo(.LCPI18_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB18_2 @@ -2314,8 +2314,8 @@ define half @trunc_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: trunc_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI19_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI19_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB19_2 @@ -2405,8 +2405,8 @@ define half @rint_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: rint_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI20_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI20_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB20_2 @@ -2616,8 +2616,8 @@ define half @round_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: round_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI22_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI22_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB22_2 @@ -2707,8 +2707,8 @@ define half @roundeven_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: roundeven_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI23_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI23_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB23_2 diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll index 04a8a66f44598..0b93c8789fca5 100644 --- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll @@ -28,8 +28,8 @@ define signext i32 @test_floor_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_floor_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI0_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI0_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB0_2 @@ -153,8 +153,8 @@ define i64 @test_floor_si64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_floor_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB1_2 @@ -174,9 +174,9 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI1_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI1_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -203,8 +203,8 @@ define i64 @test_floor_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_floor_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli 
a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB1_2 @@ -317,9 +317,9 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -381,8 +381,8 @@ define signext i32 @test_floor_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_floor_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI2_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI2_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB2_2 @@ -400,8 +400,8 @@ define signext i32 @test_floor_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_floor_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI2_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI2_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB2_2 @@ -555,8 +555,8 @@ define i64 @test_floor_ui64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_floor_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI3_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB3_2 @@ -574,9 +574,9 @@ define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI3_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI3_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -590,8 +590,8 @@ define i64 @test_floor_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_floor_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI3_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB3_2 @@ -689,9 +689,9 @@ define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI3_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI3_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -740,8 +740,8 @@ define signext i32 @test_ceil_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_ceil_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI4_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI4_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB4_2 @@ -865,8 +865,8 @@ define i64 @test_ceil_si64(half %x) nounwind { 
; ; RV32IZHINX-LABEL: test_ceil_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI5_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI5_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB5_2 @@ -886,9 +886,9 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI5_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI5_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -915,8 +915,8 @@ define i64 @test_ceil_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_ceil_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI5_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI5_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB5_2 @@ -1029,9 +1029,9 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI5_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI5_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -1093,8 +1093,8 @@ define signext i32 @test_ceil_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI6_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI6_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB6_2 @@ -1112,8 +1112,8 @@ define signext i32 @test_ceil_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI6_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI6_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB6_2 @@ -1267,8 +1267,8 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_ceil_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI7_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI7_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB7_2 @@ -1286,9 +1286,9 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI7_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI7_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -1302,8 +1302,8 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_ceil_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI7_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI7_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; 
RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB7_2 @@ -1401,9 +1401,9 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI7_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI7_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -1452,8 +1452,8 @@ define signext i32 @test_trunc_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_trunc_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI8_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI8_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB8_2 @@ -1577,8 +1577,8 @@ define i64 @test_trunc_si64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_trunc_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI9_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI9_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB9_2 @@ -1598,9 +1598,9 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI9_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI9_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -1627,8 +1627,8 @@ define i64 @test_trunc_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_trunc_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI9_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI9_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB9_2 @@ -1741,9 +1741,9 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI9_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI9_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -1805,8 +1805,8 @@ define signext i32 @test_trunc_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI10_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB10_2 @@ -1824,8 +1824,8 @@ define signext i32 @test_trunc_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI10_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI10_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB10_2 @@ -1979,8 +1979,8 @@ define i64 @test_trunc_ui64(half %x) 
nounwind { ; ; RV32IZHINX-LABEL: test_trunc_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI11_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI11_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB11_2 @@ -1998,9 +1998,9 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI11_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI11_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -2014,8 +2014,8 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_trunc_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI11_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI11_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB11_2 @@ -2113,9 +2113,9 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI11_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI11_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -2164,8 +2164,8 @@ define signext i32 @test_round_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_round_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI12_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI12_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB12_2 @@ -2289,8 +2289,8 @@ define i64 @test_round_si64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_round_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI13_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI13_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB13_2 @@ -2310,9 +2310,9 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI13_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI13_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -2339,8 +2339,8 @@ define i64 @test_round_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_round_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI13_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI13_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB13_2 @@ -2453,9 +2453,9 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI13_0) -; 
RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI13_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -2517,8 +2517,8 @@ define signext i32 @test_round_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_round_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI14_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI14_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB14_2 @@ -2536,8 +2536,8 @@ define signext i32 @test_round_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_round_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI14_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI14_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB14_2 @@ -2691,8 +2691,8 @@ define i64 @test_round_ui64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_round_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI15_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI15_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB15_2 @@ -2710,9 +2710,9 @@ define i64 @test_round_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI15_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI15_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -2726,8 +2726,8 @@ define i64 @test_round_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_round_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI15_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI15_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB15_2 @@ -2825,9 +2825,9 @@ define i64 @test_round_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI15_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI15_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -2876,8 +2876,8 @@ define signext i32 @test_roundeven_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_roundeven_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI16_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI16_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB16_2 @@ -3001,8 +3001,8 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_roundeven_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI17_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI17_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, 
a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB17_2 @@ -3022,9 +3022,9 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI17_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI17_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -3051,8 +3051,8 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_roundeven_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI17_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI17_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB17_2 @@ -3165,9 +3165,9 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI17_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI17_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -3229,8 +3229,8 @@ define signext i32 @test_roundeven_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI18_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI18_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB18_2 @@ -3248,8 +3248,8 @@ define signext i32 @test_roundeven_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI18_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI18_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB18_2 @@ -3403,8 +3403,8 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_roundeven_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI19_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI19_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB19_2 @@ -3422,9 +3422,9 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI19_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI19_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -3438,8 +3438,8 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_roundeven_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI19_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI19_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB19_2 @@ -3537,9 +3537,9 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; 
RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI19_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI19_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -3588,8 +3588,8 @@ define signext i32 @test_rint_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_rint_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI20_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI20_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB20_2 @@ -3713,8 +3713,8 @@ define i64 @test_rint_si64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_rint_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI21_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI21_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB21_2 @@ -3734,9 +3734,9 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI21_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI21_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -3763,8 +3763,8 @@ define i64 @test_rint_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_rint_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI21_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI21_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB21_2 @@ -3877,9 +3877,9 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI21_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI21_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -3941,8 +3941,8 @@ define signext i32 @test_rint_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_rint_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI22_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI22_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB22_2 @@ -3960,8 +3960,8 @@ define signext i32 @test_rint_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_rint_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI22_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI22_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB22_2 @@ -4115,8 +4115,8 @@ define i64 @test_rint_ui64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_rint_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI23_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI23_0)(a1) +; 
RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB23_2 @@ -4134,9 +4134,9 @@ define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI23_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI23_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -4150,8 +4150,8 @@ define i64 @test_rint_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_rint_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI23_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI23_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB23_2 @@ -4249,9 +4249,9 @@ define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI23_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI23_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/half-round-conv.ll b/llvm/test/CodeGen/RISCV/half-round-conv.ll index 2a1e0cfdda83e..8eeea07426575 100644 --- a/llvm/test/CodeGen/RISCV/half-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/half-round-conv.ll @@ -29,8 +29,8 @@ define signext i8 @test_floor_si8(half %x) { ; ; RV32IZHINX-LABEL: test_floor_si8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI0_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI0_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB0_2 @@ -44,8 +44,8 @@ define signext i8 @test_floor_si8(half %x) { ; ; RV64IZHINX-LABEL: test_floor_si8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI0_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI0_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB0_2 @@ -144,8 +144,8 @@ define signext i16 @test_floor_si16(half %x) { ; ; RV32IZHINX-LABEL: test_floor_si16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB1_2 @@ -159,8 +159,8 @@ define signext i16 @test_floor_si16(half %x) { ; ; RV64IZHINX-LABEL: test_floor_si16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB1_2 @@ -254,8 +254,8 @@ define signext i32 @test_floor_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_floor_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI2_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI2_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; 
CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB2_2 @@ -335,8 +335,8 @@ define i64 @test_floor_si64(half %x) { ; ; RV32IZHINX-LABEL: test_floor_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI3_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB3_2 @@ -356,8 +356,8 @@ define i64 @test_floor_si64(half %x) { ; ; RV64IZHINX-LABEL: test_floor_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI3_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB3_2 @@ -466,8 +466,8 @@ define zeroext i8 @test_floor_ui8(half %x) { ; ; RV32IZHINX-LABEL: test_floor_ui8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI4_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI4_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB4_2 @@ -481,8 +481,8 @@ define zeroext i8 @test_floor_ui8(half %x) { ; ; RV64IZHINX-LABEL: test_floor_ui8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI4_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI4_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB4_2 @@ -581,8 +581,8 @@ define zeroext i16 @test_floor_ui16(half %x) { ; ; RV32IZHINX-LABEL: test_floor_ui16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI5_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI5_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB5_2 @@ -596,8 +596,8 @@ define zeroext i16 @test_floor_ui16(half %x) { ; ; RV64IZHINX-LABEL: test_floor_ui16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI5_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI5_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB5_2 @@ -691,8 +691,8 @@ define signext i32 @test_floor_ui32(half %x) { ; ; CHECKIZHINX-LABEL: test_floor_ui32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI6_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI6_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB6_2 @@ -772,8 +772,8 @@ define i64 @test_floor_ui64(half %x) { ; ; RV32IZHINX-LABEL: test_floor_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI7_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI7_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB7_2 @@ -793,8 +793,8 @@ define i64 @test_floor_ui64(half %x) { ; ; RV64IZHINX-LABEL: test_floor_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI7_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI7_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, 
a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB7_2 @@ -903,8 +903,8 @@ define signext i8 @test_ceil_si8(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_si8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI8_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI8_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB8_2 @@ -918,8 +918,8 @@ define signext i8 @test_ceil_si8(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_si8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI8_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI8_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB8_2 @@ -1018,8 +1018,8 @@ define signext i16 @test_ceil_si16(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_si16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI9_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI9_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB9_2 @@ -1033,8 +1033,8 @@ define signext i16 @test_ceil_si16(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_si16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI9_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI9_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB9_2 @@ -1128,8 +1128,8 @@ define signext i32 @test_ceil_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_ceil_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI10_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI10_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB10_2 @@ -1209,8 +1209,8 @@ define i64 @test_ceil_si64(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI11_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI11_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB11_2 @@ -1230,8 +1230,8 @@ define i64 @test_ceil_si64(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI11_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI11_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB11_2 @@ -1340,8 +1340,8 @@ define zeroext i8 @test_ceil_ui8(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_ui8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI12_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI12_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB12_2 @@ -1355,8 +1355,8 @@ define zeroext i8 @test_ceil_ui8(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_ui8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI12_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI12_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB12_2 @@ -1455,8 +1455,8 @@ define zeroext 
i16 @test_ceil_ui16(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_ui16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI13_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI13_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB13_2 @@ -1470,8 +1470,8 @@ define zeroext i16 @test_ceil_ui16(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_ui16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI13_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI13_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB13_2 @@ -1565,8 +1565,8 @@ define signext i32 @test_ceil_ui32(half %x) { ; ; CHECKIZHINX-LABEL: test_ceil_ui32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI14_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI14_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB14_2 @@ -1646,8 +1646,8 @@ define i64 @test_ceil_ui64(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI15_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI15_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB15_2 @@ -1667,8 +1667,8 @@ define i64 @test_ceil_ui64(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI15_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI15_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB15_2 @@ -1777,8 +1777,8 @@ define signext i8 @test_trunc_si8(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_si8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI16_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI16_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB16_2 @@ -1792,8 +1792,8 @@ define signext i8 @test_trunc_si8(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_si8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI16_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI16_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB16_2 @@ -1892,8 +1892,8 @@ define signext i16 @test_trunc_si16(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_si16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI17_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI17_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB17_2 @@ -1907,8 +1907,8 @@ define signext i16 @test_trunc_si16(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_si16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI17_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI17_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB17_2 @@ -2002,8 +2002,8 @@ define signext i32 @test_trunc_si32(half %x) { ; ; 
CHECKIZHINX-LABEL: test_trunc_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI18_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI18_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB18_2 @@ -2083,8 +2083,8 @@ define i64 @test_trunc_si64(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI19_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI19_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB19_2 @@ -2104,8 +2104,8 @@ define i64 @test_trunc_si64(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI19_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI19_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB19_2 @@ -2214,8 +2214,8 @@ define zeroext i8 @test_trunc_ui8(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_ui8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI20_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI20_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB20_2 @@ -2229,8 +2229,8 @@ define zeroext i8 @test_trunc_ui8(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_ui8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI20_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI20_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB20_2 @@ -2329,8 +2329,8 @@ define zeroext i16 @test_trunc_ui16(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_ui16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI21_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI21_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB21_2 @@ -2344,8 +2344,8 @@ define zeroext i16 @test_trunc_ui16(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_ui16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI21_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI21_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB21_2 @@ -2439,8 +2439,8 @@ define signext i32 @test_trunc_ui32(half %x) { ; ; CHECKIZHINX-LABEL: test_trunc_ui32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI22_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI22_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB22_2 @@ -2520,8 +2520,8 @@ define i64 @test_trunc_ui64(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI23_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI23_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB23_2 @@ -2541,8 +2541,8 @@ define i64 @test_trunc_ui64(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_ui64: ; 
RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI23_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI23_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB23_2 @@ -2651,8 +2651,8 @@ define signext i8 @test_round_si8(half %x) { ; ; RV32IZHINX-LABEL: test_round_si8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI24_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI24_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB24_2 @@ -2666,8 +2666,8 @@ define signext i8 @test_round_si8(half %x) { ; ; RV64IZHINX-LABEL: test_round_si8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI24_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI24_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB24_2 @@ -2766,8 +2766,8 @@ define signext i16 @test_round_si16(half %x) { ; ; RV32IZHINX-LABEL: test_round_si16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI25_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI25_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB25_2 @@ -2781,8 +2781,8 @@ define signext i16 @test_round_si16(half %x) { ; ; RV64IZHINX-LABEL: test_round_si16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI25_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI25_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB25_2 @@ -2876,8 +2876,8 @@ define signext i32 @test_round_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_round_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI26_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI26_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB26_2 @@ -2957,8 +2957,8 @@ define i64 @test_round_si64(half %x) { ; ; RV32IZHINX-LABEL: test_round_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI27_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI27_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB27_2 @@ -2978,8 +2978,8 @@ define i64 @test_round_si64(half %x) { ; ; RV64IZHINX-LABEL: test_round_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI27_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI27_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB27_2 @@ -3088,8 +3088,8 @@ define zeroext i8 @test_round_ui8(half %x) { ; ; RV32IZHINX-LABEL: test_round_ui8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI28_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI28_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB28_2 @@ -3103,8 +3103,8 @@ define zeroext i8 @test_round_ui8(half %x) { ; ; RV64IZHINX-LABEL: test_round_ui8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui 
a1, %hi(.LCPI28_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI28_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB28_2 @@ -3203,8 +3203,8 @@ define zeroext i16 @test_round_ui16(half %x) { ; ; RV32IZHINX-LABEL: test_round_ui16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI29_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI29_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB29_2 @@ -3218,8 +3218,8 @@ define zeroext i16 @test_round_ui16(half %x) { ; ; RV64IZHINX-LABEL: test_round_ui16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI29_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI29_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB29_2 @@ -3313,8 +3313,8 @@ define signext i32 @test_round_ui32(half %x) { ; ; CHECKIZHINX-LABEL: test_round_ui32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI30_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI30_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB30_2 @@ -3394,8 +3394,8 @@ define i64 @test_round_ui64(half %x) { ; ; RV32IZHINX-LABEL: test_round_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI31_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI31_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB31_2 @@ -3415,8 +3415,8 @@ define i64 @test_round_ui64(half %x) { ; ; RV64IZHINX-LABEL: test_round_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI31_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI31_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB31_2 @@ -3525,8 +3525,8 @@ define signext i8 @test_roundeven_si8(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_si8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI32_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI32_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB32_2 @@ -3540,8 +3540,8 @@ define signext i8 @test_roundeven_si8(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_si8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI32_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI32_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB32_2 @@ -3640,8 +3640,8 @@ define signext i16 @test_roundeven_si16(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_si16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI33_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI33_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB33_2 @@ -3655,8 +3655,8 @@ define signext i16 @test_roundeven_si16(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_si16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, 
%hi(.LCPI33_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI33_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB33_2 @@ -3750,8 +3750,8 @@ define signext i32 @test_roundeven_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_roundeven_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI34_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI34_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB34_2 @@ -3831,8 +3831,8 @@ define i64 @test_roundeven_si64(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI35_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI35_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB35_2 @@ -3852,8 +3852,8 @@ define i64 @test_roundeven_si64(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI35_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI35_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB35_2 @@ -3962,8 +3962,8 @@ define zeroext i8 @test_roundeven_ui8(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_ui8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI36_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI36_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB36_2 @@ -3977,8 +3977,8 @@ define zeroext i8 @test_roundeven_ui8(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_ui8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI36_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI36_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB36_2 @@ -4077,8 +4077,8 @@ define zeroext i16 @test_roundeven_ui16(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_ui16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI37_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI37_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB37_2 @@ -4092,8 +4092,8 @@ define zeroext i16 @test_roundeven_ui16(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_ui16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI37_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI37_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB37_2 @@ -4187,8 +4187,8 @@ define signext i32 @test_roundeven_ui32(half %x) { ; ; CHECKIZHINX-LABEL: test_roundeven_ui32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI38_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI38_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB38_2 @@ -4268,8 +4268,8 @@ define i64 @test_roundeven_ui64(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_ui64: ; RV32IZHINX: # 
%bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI39_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI39_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB39_2 @@ -4289,8 +4289,8 @@ define i64 @test_roundeven_ui64(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI39_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI39_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB39_2 @@ -4424,8 +4424,8 @@ define half @test_floor_half(half %x) { ; ; CHECKIZHINX-LABEL: test_floor_half: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI40_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI40_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB40_2 @@ -4508,8 +4508,8 @@ define half @test_ceil_half(half %x) { ; ; CHECKIZHINX-LABEL: test_ceil_half: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI41_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI41_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB41_2 @@ -4592,8 +4592,8 @@ define half @test_trunc_half(half %x) { ; ; CHECKIZHINX-LABEL: test_trunc_half: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI42_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI42_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB42_2 @@ -4676,8 +4676,8 @@ define half @test_round_half(half %x) { ; ; CHECKIZHINX-LABEL: test_round_half: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI43_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI43_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB43_2 @@ -4760,8 +4760,8 @@ define half @test_roundeven_half(half %x) { ; ; CHECKIZHINX-LABEL: test_roundeven_half: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI44_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI44_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB44_2 From 56b2be4a7608770bae5db9d467f50c232c3cf19a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 5 Sep 2024 16:46:02 +0100 Subject: [PATCH 246/425] [X86] Fold scalar_to_vector(funnel(x,y,imm)) -> funnel(scalar_to_vector(x),scalar_to_vector(y),imm) Limit this to cases where x, y are known to be extracted from a vector. 
Addresses poor x86 codegen on #107289 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 18 ++++++++ .../CodeGen/X86/vector-shuffle-combining.ll | 42 +++---------------- 2 files changed, 23 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a4ad4a1bb1201..c91d37727b611 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57500,6 +57500,24 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, } } break; + case ISD::FSHL: + case ISD::FSHR: + if (auto *Amt = dyn_cast(Src.getOperand(2))) { + if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) && + Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Src.hasOneUse()) { + uint64_t AmtVal = + Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits()); + SDValue SrcVec0 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0)); + SDValue SrcVec1 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1)); + return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1, + DAG.getConstant(AmtVal, DL, VT)); + } + } + break; } return SDValue(); diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 04262b4249256..36c4be5f1939e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3534,46 +3534,14 @@ define <4 x i32> @PR63700(i128 %0) { } define <16 x i8> @PR107289(<16 x i8> %0) { -; SSE2-LABEL: PR107289: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rcx -; SSE2-NEXT: shldq $8, %rax, %rcx -; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: psllq $8, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR107289: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSSE3-NEXT: movq %xmm1, %rcx -; SSSE3-NEXT: shldq $8, %rax, %rcx -; SSSE3-NEXT: movq %rcx, %xmm1 -; SSSE3-NEXT: psllq $8, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR107289: -; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: shldq $8, %rax, %rcx -; SSE41-NEXT: movq %rcx, %xmm1 -; SSE41-NEXT: psllq $8, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: retq +; SSE-LABEL: PR107289: +; SSE: # %bb.0: +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE-NEXT: retq ; ; AVX-LABEL: PR107289: ; AVX: # %bb.0: -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: shldq $8, %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpsllq $8, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] ; AVX-NEXT: retq %src = bitcast <16 x i8> %0 to i128 %shl = shl i128 %src, 8 From 2f6e4ed389a6589f340d7efab2b0c7ee22c3d086 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 5 Sep 2024 16:48:22 +0100 Subject: [PATCH 247/425] [IR] Check parameters of target extension types on construction (#107268) Since IR Types are immutable it makes sense to check them on construction instead of in the IR Verifier pass. 
This patch checks that some TargetExtTypes are well-formed in the sense that they have the expected number of type parameters and integer parameters. When called from LLParser it gives a diagnostic message. When called from anywhere else it just asserts that they are well-formed. --- llvm/include/llvm/IR/DerivedTypes.h | 14 +++++++ llvm/lib/AsmParser/LLParser.cpp | 7 +++- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 6 ++- llvm/lib/IR/Type.cpp | 39 +++++++++++++++---- .../Assembler/target-type-param-errors.ll | 12 ++++++ 5 files changed, 69 insertions(+), 9 deletions(-) create mode 100644 llvm/test/Assembler/target-type-param-errors.ll diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 01f76d4932780..0c8cbe1921ac9 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -32,6 +32,7 @@ namespace llvm { class Value; class APInt; class LLVMContext; +template class Expected; /// Class to represent integer types. Note that this class is also used to /// represent the built-in integer types: Int1Ty, Int8Ty, Int16Ty, Int32Ty and @@ -735,6 +736,19 @@ class TargetExtType : public Type { ArrayRef Types = std::nullopt, ArrayRef Ints = std::nullopt); + /// Return a target extension type having the specified name and optional + /// type and integer parameters, or an appropriate Error if it fails the + /// parameters check. + static Expected + getOrError(LLVMContext &Context, StringRef Name, + ArrayRef Types = std::nullopt, + ArrayRef Ints = std::nullopt); + + /// Check that a newly created target extension type has the expected number + /// of type parameters and integer parameters, returning the type itself if OK + /// or an appropriate Error if not. + static Expected checkParams(TargetExtType *TTy); + /// Return the name for this target extension type. Two distinct target /// extension types may have the same name if their type or integer parameters /// differ. 
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index f41907f035125..93dc2bd241581 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -3530,7 +3530,12 @@ bool LLParser::parseTargetExtType(Type *&Result) { if (parseToken(lltok::rparen, "expected ')' in target extension type")) return true; - Result = TargetExtType::get(Context, TypeName, TypeParams, IntParams); + auto TTy = + TargetExtType::getOrError(Context, TypeName, TypeParams, IntParams); + if (auto E = TTy.takeError()) + return tokError(toString(std::move(E))); + + Result = *TTy; return false; } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 654be985a3229..1cd9ec6b8fca2 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2679,7 +2679,11 @@ Error BitcodeReader::parseTypeTableBody() { return error("Integer parameter too large"); IntParams.push_back(Record[i]); } - ResultTy = TargetExtType::get(Context, TypeName, TypeParams, IntParams); + auto TTy = + TargetExtType::getOrError(Context, TypeName, TypeParams, IntParams); + if (auto E = TTy.takeError()) + return E; + ResultTy = *TTy; TypeName.clear(); break; } diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 753aa5fd2118e..93891461dd663 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Error.h" #include "llvm/Support/TypeSize.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/RISCVTargetParser.h" @@ -792,6 +793,13 @@ TargetExtType::TargetExtType(LLVMContext &C, StringRef Name, TargetExtType *TargetExtType::get(LLVMContext &C, StringRef Name, ArrayRef Types, ArrayRef Ints) { + return cantFail(getOrError(C, Name, Types, Ints)); +} + +Expected TargetExtType::getOrError(LLVMContext &C, + StringRef Name, + ArrayRef Types, + ArrayRef Ints) { const TargetExtTypeKeyInfo::KeyTy Key(Name, Types, Ints); TargetExtType *TT; // Since we only want to allocate a fresh target type in case none is found @@ -799,8 +807,8 @@ TargetExtType *TargetExtType::get(LLVMContext &C, StringRef Name, // one for inserting the newly allocated one), here we instead lookup based on // Key and update the reference to the target type in-place to a newly // allocated one if not found. - auto Insertion = C.pImpl->TargetExtTypes.insert_as(nullptr, Key); - if (Insertion.second) { + auto [Iter, Inserted] = C.pImpl->TargetExtTypes.insert_as(nullptr, Key); + if (Inserted) { // The target type was not found. Allocate one and update TargetExtTypes // in-place. TT = (TargetExtType *)C.pImpl->Alloc.Allocate( @@ -808,12 +816,29 @@ TargetExtType *TargetExtType::get(LLVMContext &C, StringRef Name, sizeof(unsigned) * Ints.size(), alignof(TargetExtType)); new (TT) TargetExtType(C, Name, Types, Ints); - *Insertion.first = TT; - } else { - // The target type was found. Just return it. - TT = *Insertion.first; + *Iter = TT; + return checkParams(TT); } - return TT; + + // The target type was found. Just return it. + return *Iter; +} + +Expected TargetExtType::checkParams(TargetExtType *TTy) { + // Opaque types in the AArch64 name space. + if (TTy->Name == "aarch64.svcount" && + (TTy->getNumTypeParameters() != 0 || TTy->getNumIntParameters() != 0)) + return createStringError( + "target extension type aarch64.svcount should have no parameters"); + + // Opaque types in the RISC-V name space. 
+ if (TTy->Name == "riscv.vector.tuple" && + (TTy->getNumTypeParameters() != 1 || TTy->getNumIntParameters() != 1)) + return createStringError( + "target extension type riscv.vector.tuple should have one " + "type parameter and one integer parameter"); + + return TTy; } namespace { diff --git a/llvm/test/Assembler/target-type-param-errors.ll b/llvm/test/Assembler/target-type-param-errors.ll new file mode 100644 index 0000000000000..03180811c7549 --- /dev/null +++ b/llvm/test/Assembler/target-type-param-errors.ll @@ -0,0 +1,12 @@ +; RUN: split-file %s %t +; RUN: not llvm-as < %t/aarch64-svcount.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-SVCOUNT %s +; RUN: not llvm-as < %t/riscv-vector-tuple.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-RISCV-VECTOR-TUPLE %s +; Check target extension type properties are verified in the assembler. + +;--- aarch64-svcount.ll +declare target("aarch64.svcount", i32) @aarch64_svcount() +; CHECK-AARCH64-SVCOUNT: error: target extension type aarch64.svcount should have no parameters + +;--- riscv-vector-tuple.ll +declare target("riscv.vector.tuple", 99) @riscv_vector_tuple() +; CHECK-RISCV-VECTOR-TUPLE: target extension type riscv.vector.tuple should have one type parameter and one integer parameter From fc3e6a81868a0c84e405622a64756e57f020ca37 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 5 Sep 2024 19:54:32 +0400 Subject: [PATCH 248/425] DAG: Handle lowering unordered compare with inf (#100378) Try to take advantage of the nan check behavior of fcmp. x86_64 looks better, x86_32 looks worse. --- llvm/include/llvm/CodeGen/CodeGenCommonISel.h | 6 +- llvm/lib/CodeGen/CodeGenCommonISel.cpp | 8 ++- .../CodeGen/SelectionDAG/TargetLowering.cpp | 58 +++++++++++-------- llvm/test/CodeGen/AArch64/isinf.ll | 12 ++-- llvm/test/CodeGen/PowerPC/fp-classify.ll | 28 ++++----- 5 files changed, 67 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/CodeGen/CodeGenCommonISel.h b/llvm/include/llvm/CodeGen/CodeGenCommonISel.h index 90ef890f22d1b..4c22be9450786 100644 --- a/llvm/include/llvm/CodeGen/CodeGenCommonISel.h +++ b/llvm/include/llvm/CodeGen/CodeGenCommonISel.h @@ -218,10 +218,14 @@ findSplitPointForStackProtector(MachineBasicBlock *BB, /// Evaluates if the specified FP class test is better performed as the inverse /// (i.e. fewer instructions should be required to lower it). An example is the /// test "inf|normal|subnormal|zero", which is an inversion of "nan". +/// /// \param Test The test as specified in 'is_fpclass' intrinsic invocation. +/// \param UseFCmp The intention is to perform the comparison using +/// floating-point compare instructions which check for nan. +/// /// \returns The inverted test, or fcNone, if inversion does not produce a /// simpler test. -FPClassTest invertFPClassTestIfSimpler(FPClassTest Test); +FPClassTest invertFPClassTestIfSimpler(FPClassTest Test, bool UseFCmp); /// Assuming the instruction \p MI is going to be deleted, attempt to salvage /// debug users of \p MI by writing the effect of \p MI in a DIExpression. 
diff --git a/llvm/lib/CodeGen/CodeGenCommonISel.cpp b/llvm/lib/CodeGen/CodeGenCommonISel.cpp index fe144d3c18203..d985751e2be0b 100644 --- a/llvm/lib/CodeGen/CodeGenCommonISel.cpp +++ b/llvm/lib/CodeGen/CodeGenCommonISel.cpp @@ -173,8 +173,9 @@ llvm::findSplitPointForStackProtector(MachineBasicBlock *BB, return SplitPoint; } -FPClassTest llvm::invertFPClassTestIfSimpler(FPClassTest Test) { +FPClassTest llvm::invertFPClassTestIfSimpler(FPClassTest Test, bool UseFCmp) { FPClassTest InvertedTest = ~Test; + // Pick the direction with fewer tests // TODO: Handle more combinations of cases that can be handled together switch (static_cast(InvertedTest)) { @@ -200,6 +201,11 @@ FPClassTest llvm::invertFPClassTestIfSimpler(FPClassTest Test) { case fcSubnormal | fcZero: case fcSubnormal | fcZero | fcNan: return InvertedTest; + case fcInf | fcNan: + // If we're trying to use fcmp, we can take advantage of the nan check + // behavior of the compare (but this is more instructions in the integer + // expansion). + return UseFCmp ? InvertedTest : fcNone; default: return fcNone; } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 01feec0c435ed..c3affabb19d37 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8675,7 +8675,7 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, // Degenerated cases. if (Test == fcNone) return DAG.getBoolConstant(false, DL, ResultVT, OperandVT); - if ((Test & fcAllFlags) == fcAllFlags) + if (Test == fcAllFlags) return DAG.getBoolConstant(true, DL, ResultVT, OperandVT); // PPC double double is a pair of doubles, of which the higher part determines @@ -8686,14 +8686,6 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, OperandVT = MVT::f64; } - // Some checks may be represented as inversion of simpler check, for example - // "inf|normal|subnormal|zero" => !"nan". - bool IsInverted = false; - if (FPClassTest InvertedCheck = invertFPClassTestIfSimpler(Test)) { - IsInverted = true; - Test = InvertedCheck; - } - // Floating-point type properties. EVT ScalarFloatVT = OperandVT.getScalarType(); const Type *FloatTy = ScalarFloatVT.getTypeForEVT(*DAG.getContext()); @@ -8705,9 +8697,16 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, if (Flags.hasNoFPExcept() && isOperationLegalOrCustom(ISD::SETCC, OperandVT.getScalarType())) { FPClassTest FPTestMask = Test; + bool IsInvertedFP = false; + + if (FPClassTest InvertedFPCheck = + invertFPClassTestIfSimpler(FPTestMask, true)) { + FPTestMask = InvertedFPCheck; + IsInvertedFP = true; + } - ISD::CondCode OrderedCmpOpcode = IsInverted ? ISD::SETUNE : ISD::SETOEQ; - ISD::CondCode UnorderedCmpOpcode = IsInverted ? ISD::SETONE : ISD::SETUEQ; + ISD::CondCode OrderedCmpOpcode = IsInvertedFP ? ISD::SETUNE : ISD::SETOEQ; + ISD::CondCode UnorderedCmpOpcode = IsInvertedFP ? ISD::SETONE : ISD::SETUEQ; // See if we can fold an | fcNan into an unordered compare. FPClassTest OrderedFPTestMask = FPTestMask & ~fcNan; @@ -8720,7 +8719,7 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, const bool IsOrdered = FPTestMask == OrderedFPTestMask; if (std::optional IsCmp0 = - isFCmpEqualZero(Test, Semantics, DAG.getMachineFunction()); + isFCmpEqualZero(FPTestMask, Semantics, DAG.getMachineFunction()); IsCmp0 && (isCondCodeLegalOrCustom( *IsCmp0 ? 
OrderedCmpOpcode : UnorderedCmpOpcode, OperandVT.getScalarType().getSimpleVT()))) { @@ -8732,31 +8731,35 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, *IsCmp0 ? OrderedCmpOpcode : UnorderedCmpOpcode); } - if (Test == fcNan && - isCondCodeLegalOrCustom(IsInverted ? ISD::SETO : ISD::SETUO, - OperandVT.getScalarType().getSimpleVT())) { + if (FPTestMask == fcNan && + isCondCodeLegalOrCustom(IsInvertedFP ? ISD::SETO : ISD::SETUO, + OperandVT.getScalarType().getSimpleVT())) return DAG.getSetCC(DL, ResultVT, Op, Op, - IsInverted ? ISD::SETO : ISD::SETUO); - } + IsInvertedFP ? ISD::SETO : ISD::SETUO); - if (Test == fcInf && - isCondCodeLegalOrCustom(IsInverted ? ISD::SETUNE : ISD::SETOEQ, + bool IsOrderedInf = FPTestMask == fcInf; + if ((FPTestMask == fcInf || FPTestMask == (fcInf | fcNan)) && + isCondCodeLegalOrCustom(IsOrderedInf ? OrderedCmpOpcode + : UnorderedCmpOpcode, OperandVT.getScalarType().getSimpleVT()) && - isOperationLegalOrCustom(ISD::FABS, OperandVT.getScalarType())) { + isOperationLegalOrCustom(ISD::FABS, OperandVT.getScalarType()) && + (isOperationLegal(ISD::ConstantFP, OperandVT.getScalarType()) || + (OperandVT.isVector() && + isOperationLegalOrCustom(ISD::BUILD_VECTOR, OperandVT)))) { // isinf(x) --> fabs(x) == inf SDValue Abs = DAG.getNode(ISD::FABS, DL, OperandVT, Op); SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), DL, OperandVT); return DAG.getSetCC(DL, ResultVT, Abs, Inf, - IsInverted ? ISD::SETUNE : ISD::SETOEQ); + IsOrderedInf ? OrderedCmpOpcode : UnorderedCmpOpcode); } if (OrderedFPTestMask == (fcSubnormal | fcZero) && !IsOrdered) { // TODO: Could handle ordered case, but it produces worse code for // x86. Maybe handle ordered if fabs is free? - ISD::CondCode OrderedOp = IsInverted ? ISD::SETUGE : ISD::SETOLT; - ISD::CondCode UnorderedOp = IsInverted ? ISD::SETOGE : ISD::SETULT; + ISD::CondCode OrderedOp = IsInvertedFP ? ISD::SETUGE : ISD::SETOLT; + ISD::CondCode UnorderedOp = IsInvertedFP ? ISD::SETOGE : ISD::SETULT; if (isCondCodeLegalOrCustom(IsOrdered ? OrderedOp : UnorderedOp, OperandVT.getScalarType().getSimpleVT())) { @@ -8773,6 +8776,15 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, } } + // Some checks may be represented as inversion of simpler check, for example + // "inf|normal|subnormal|zero" => !"nan". + bool IsInverted = false; + + if (FPClassTest InvertedCheck = invertFPClassTestIfSimpler(Test, false)) { + Test = InvertedCheck; + IsInverted = true; + } + // In the general case use integer operations. 
unsigned BitSize = OperandVT.getScalarSizeInBits(); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), BitSize); diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll index 834417b98743a..e68539bcf07d9 100644 --- a/llvm/test/CodeGen/AArch64/isinf.ll +++ b/llvm/test/CodeGen/AArch64/isinf.ll @@ -26,10 +26,10 @@ define i32 @replace_isinf_call_f16(half %x) { define i32 @replace_isinf_call_f32(float %x) { ; CHECK-LABEL: replace_isinf_call_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fabs s0, s0 +; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: and w9, w9, #0x7fffffff +; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %abs = tail call float @llvm.fabs.f32(float %x) @@ -42,10 +42,10 @@ define i32 @replace_isinf_call_f32(float %x) { define i32 @replace_isinf_call_f64(double %x) { ; CHECK-LABEL: replace_isinf_call_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: fabs d0, d0 +; CHECK-NEXT: fmov x9, d0 ; CHECK-NEXT: mov x8, #9218868437227405312 // =0x7ff0000000000000 -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: fcmp d0, d1 +; CHECK-NEXT: and x9, x9, #0x7fffffffffffffff +; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %abs = tail call double @llvm.fabs.f64(double %x) diff --git a/llvm/test/CodeGen/PowerPC/fp-classify.ll b/llvm/test/CodeGen/PowerPC/fp-classify.ll index f527b3c48040e..dc9853ff2e301 100644 --- a/llvm/test/CodeGen/PowerPC/fp-classify.ll +++ b/llvm/test/CodeGen/PowerPC/fp-classify.ll @@ -7,13 +7,13 @@ define zeroext i1 @abs_isinff(float %x) { ; P8-LABEL: abs_isinff: ; P8: # %bb.0: # %entry -; P8-NEXT: addis 3, 2, .LCPI0_0@toc@ha -; P8-NEXT: xsabsdp 0, 1 -; P8-NEXT: li 4, 1 -; P8-NEXT: lfs 1, .LCPI0_0@toc@l(3) -; P8-NEXT: li 3, 0 -; P8-NEXT: fcmpu 0, 0, 1 -; P8-NEXT: iseleq 3, 4, 3 +; P8-NEXT: xscvdpspn 0, 1 +; P8-NEXT: lis 4, 32640 +; P8-NEXT: mffprwz 3, 0 +; P8-NEXT: clrlwi 3, 3, 1 +; P8-NEXT: xor 3, 3, 4 +; P8-NEXT: cntlzw 3, 3 +; P8-NEXT: srwi 3, 3, 5 ; P8-NEXT: blr ; ; P9-LABEL: abs_isinff: @@ -32,13 +32,13 @@ entry: define zeroext i1 @abs_isinf(double %x) { ; P8-LABEL: abs_isinf: ; P8: # %bb.0: # %entry -; P8-NEXT: addis 3, 2, .LCPI1_0@toc@ha -; P8-NEXT: xsabsdp 0, 1 -; P8-NEXT: li 4, 1 -; P8-NEXT: lfs 1, .LCPI1_0@toc@l(3) -; P8-NEXT: li 3, 0 -; P8-NEXT: fcmpu 0, 0, 1 -; P8-NEXT: iseleq 3, 4, 3 +; P8-NEXT: mffprd 3, 1 +; P8-NEXT: li 4, 2047 +; P8-NEXT: rldic 4, 4, 52, 1 +; P8-NEXT: clrldi 3, 3, 1 +; P8-NEXT: xor 3, 3, 4 +; P8-NEXT: cntlzd 3, 3 +; P8-NEXT: rldicl 3, 3, 58, 63 ; P8-NEXT: blr ; ; P9-LABEL: abs_isinf: From c2018fa40fd081a10af4f3294362db9634d9a282 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 5 Sep 2024 09:01:19 -0700 Subject: [PATCH 249/425] [NFC][Support] Refactor FormatVariadic code. (#106610) - Rename `Align` field in ReplacementItem/FmtAlign to `Width` to accurately reflect its use. - Change both `Width` and `Index` in ReplacementItem to 32-bit int instead of size_t (as 64-bits seems excessive in this context). - Eliminate the use of `Empty` ReplacementType, and use the existing std::optional<> instead to indicate that. - Eliminate some boilerplate type code in formatv(). - Eliminate the loop in `splitLiteralAndReplacement`. The existing code will never loop back. - Directly use constructor instead of std::make_pair. 
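For reference, `Width` is the field-width part of a formatv replacement spec, `{index[,layout][:options]}`, where the layout is an optional fill and alignment character followed by the width. A minimal usage sketch follows; it is illustrative only and not part of this change (the format string, values, and function name are made up):

  // Illustrative sketch, not part of the patch: exercises the width/alignment
  // handling in FmtAlign and the renamed ReplacementItem::Width field.
  #include "llvm/Support/FormatVariadic.h"
  #include "llvm/Support/raw_ostream.h"

  void formatvWidthDemo() {
    // "{0,-8}"    -> argument 0, left-aligned in an 8-character field.
    // "{1,=10:x}" -> argument 1, centered in a 10-character field, hex option.
    llvm::outs() << llvm::formatv("{0,-8}|{1,=10:x}|\n", "name", 255);
  }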
--- llvm/include/llvm/Support/FormatCommon.h | 20 ++-- llvm/include/llvm/Support/FormatVariadic.h | 27 ++--- llvm/lib/Support/FormatVariadic.cpp | 108 ++++++++---------- llvm/unittests/Support/FormatVariadicTest.cpp | 40 +++---- 4 files changed, 90 insertions(+), 105 deletions(-) diff --git a/llvm/include/llvm/Support/FormatCommon.h b/llvm/include/llvm/Support/FormatCommon.h index 326e00936aa7c..eaf291b864c5e 100644 --- a/llvm/include/llvm/Support/FormatCommon.h +++ b/llvm/include/llvm/Support/FormatCommon.h @@ -16,15 +16,17 @@ namespace llvm { enum class AlignStyle { Left, Center, Right }; +/// Helper class to format to a \p Width wide field, with alignment \p Where +/// within that field. struct FmtAlign { support::detail::format_adapter &Adapter; AlignStyle Where; - size_t Amount; + unsigned Width; char Fill; FmtAlign(support::detail::format_adapter &Adapter, AlignStyle Where, - size_t Amount, char Fill = ' ') - : Adapter(Adapter), Where(Where), Amount(Amount), Fill(Fill) {} + unsigned Width, char Fill = ' ') + : Adapter(Adapter), Where(Where), Width(Width), Fill(Fill) {} void format(raw_ostream &S, StringRef Options) { // If we don't need to align, we can format straight into the underlying @@ -32,7 +34,7 @@ struct FmtAlign { // in order to calculate how long the output is so we can align it. // TODO: Make the format method return the number of bytes written, that // way we can also skip the intermediate stream for left-aligned output. - if (Amount == 0) { + if (Width == 0) { Adapter.format(S, Options); return; } @@ -40,19 +42,19 @@ struct FmtAlign { raw_svector_ostream Stream(Item); Adapter.format(Stream, Options); - if (Amount <= Item.size()) { + if (Width <= Item.size()) { S << Item; return; } - size_t PadAmount = Amount - Item.size(); + unsigned PadAmount = Width - static_cast(Item.size()); switch (Where) { case AlignStyle::Left: S << Item; fill(S, PadAmount); break; case AlignStyle::Center: { - size_t X = PadAmount / 2; + unsigned X = PadAmount / 2; fill(S, X); S << Item; fill(S, PadAmount - X); @@ -66,8 +68,8 @@ struct FmtAlign { } private: - void fill(llvm::raw_ostream &S, size_t Count) { - for (size_t I = 0; I < Count; ++I) + void fill(llvm::raw_ostream &S, unsigned Count) { + for (unsigned I = 0; I < Count; ++I) S << Fill; } }; diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index f31ad70021579..005d26f02d8fd 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -43,21 +43,20 @@ namespace llvm { -enum class ReplacementType { Empty, Format, Literal }; +enum class ReplacementType { Format, Literal }; struct ReplacementItem { - ReplacementItem() = default; explicit ReplacementItem(StringRef Literal) : Type(ReplacementType::Literal), Spec(Literal) {} - ReplacementItem(StringRef Spec, size_t Index, size_t Align, AlignStyle Where, - char Pad, StringRef Options) - : Type(ReplacementType::Format), Spec(Spec), Index(Index), Align(Align), + ReplacementItem(StringRef Spec, unsigned Index, unsigned Width, + AlignStyle Where, char Pad, StringRef Options) + : Type(ReplacementType::Format), Spec(Spec), Index(Index), Width(Width), Where(Where), Pad(Pad), Options(Options) {} - ReplacementType Type = ReplacementType::Empty; + ReplacementType Type; StringRef Spec; - size_t Index = 0; - size_t Align = 0; + unsigned Index = 0; + unsigned Width = 0; AlignStyle Where = AlignStyle::Right; char Pad = 0; StringRef Options; @@ -81,8 +80,6 @@ class formatv_object_base { void format(raw_ostream &S) 
const { const auto Replacements = parseFormatString(Fmt, Adapters.size(), Validate); for (const auto &R : Replacements) { - if (R.Type == ReplacementType::Empty) - continue; if (R.Type == ReplacementType::Literal) { S << R.Spec; continue; @@ -94,7 +91,7 @@ class formatv_object_base { auto *W = Adapters[R.Index]; - FmtAlign Align(*W, R.Where, R.Align, R.Pad); + FmtAlign Align(*W, R.Where, R.Width, R.Pad); Align.format(S, R.Options); } } @@ -248,14 +245,10 @@ template class formatv_object : public formatv_object_base { // formatv() with validation enable/disable controlled by the first argument. template -inline auto formatv(bool Validate, const char *Fmt, Ts &&...Vals) - -> formatv_object(Vals))...))> { - using ParamTuple = decltype(std::make_tuple( - support::detail::build_format_adapter(std::forward(Vals))...)); +inline auto formatv(bool Validate, const char *Fmt, Ts &&...Vals) { auto Params = std::make_tuple( support::detail::build_format_adapter(std::forward(Vals))...); - return formatv_object(Fmt, std::move(Params), Validate); + return formatv_object(Fmt, std::move(Params), Validate); } // formatv() with validation enabled. diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index 26d2b549136e4..9056466190284 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -26,7 +26,7 @@ static std::optional translateLocChar(char C) { } static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad) { + unsigned &Align, char &Pad) { Where = AlignStyle::Right; Align = 0; Pad = ' '; @@ -60,14 +60,14 @@ static std::optional parseReplacementItem(StringRef Spec) { // If the replacement sequence does not start with a non-negative integer, // this is an error. char Pad = ' '; - std::size_t Align = 0; + unsigned Align = 0; AlignStyle Where = AlignStyle::Right; StringRef Options; - size_t Index = 0; + unsigned Index = 0; RepString = RepString.trim(); if (RepString.consumeInteger(0, Index)) { assert(false && "Invalid replacement sequence index!"); - return ReplacementItem{}; + return std::nullopt; } RepString = RepString.trim(); if (RepString.consume_front(",")) { @@ -83,61 +83,50 @@ static std::optional parseReplacementItem(StringRef Spec) { assert(RepString.empty() && "Unexpected characters found in replacement string!"); - return ReplacementItem{Spec, Index, Align, Where, Pad, Options}; + return ReplacementItem(Spec, Index, Align, Where, Pad, Options); } -static std::pair +static std::pair, StringRef> splitLiteralAndReplacement(StringRef Fmt) { - while (!Fmt.empty()) { - // Everything up until the first brace is a literal. - if (Fmt.front() != '{') { - std::size_t BO = Fmt.find_first_of('{'); - return std::make_pair(ReplacementItem{Fmt.substr(0, BO)}, Fmt.substr(BO)); - } - - StringRef Braces = Fmt.take_while([](char C) { return C == '{'; }); - // If there is more than one brace, then some of them are escaped. Treat - // these as replacements. - if (Braces.size() > 1) { - size_t NumEscapedBraces = Braces.size() / 2; - StringRef Middle = Fmt.take_front(NumEscapedBraces); - StringRef Right = Fmt.drop_front(NumEscapedBraces * 2); - return std::make_pair(ReplacementItem{Middle}, Right); - } - // An unterminated open brace is undefined. Assert to indicate that this is - // undefined and that we consider it an error. When asserts are disabled, - // build a replacement item with an error message. 
- std::size_t BC = Fmt.find_first_of('}'); - if (BC == StringRef::npos) { - assert( - false && - "Unterminated brace sequence. Escape with {{ for a literal brace."); - return std::make_pair( - ReplacementItem{"Unterminated brace sequence. Escape with {{ for a " - "literal brace."}, - StringRef()); - } + assert(!Fmt.empty()); + // Everything up until the first brace is a literal. + if (Fmt.front() != '{') { + size_t BO = Fmt.find_first_of('{'); + return {ReplacementItem{Fmt.substr(0, BO)}, Fmt.substr(BO)}; + } - // Even if there is a closing brace, if there is another open brace before - // this closing brace, treat this portion as literal, and try again with the - // next one. - std::size_t BO2 = Fmt.find_first_of('{', 1); - if (BO2 < BC) - return std::make_pair(ReplacementItem{Fmt.substr(0, BO2)}, - Fmt.substr(BO2)); + StringRef Braces = Fmt.take_while([](char C) { return C == '{'; }); + // If there is more than one brace, then some of them are escaped. Treat + // these as replacements. + if (Braces.size() > 1) { + size_t NumEscapedBraces = Braces.size() / 2; + StringRef Middle = Fmt.take_front(NumEscapedBraces); + StringRef Right = Fmt.drop_front(NumEscapedBraces * 2); + return {ReplacementItem(Middle), Right}; + } + // An unterminated open brace is undefined. Assert to indicate that this is + // undefined and that we consider it an error. When asserts are disabled, + // build a replacement item with an error message. + size_t BC = Fmt.find_first_of('}'); + if (BC == StringRef::npos) { + assert(false && + "Unterminated brace sequence. Escape with {{ for a literal brace."); + return {ReplacementItem("Unterminated brace sequence. Escape with {{ for a " + "literal brace."), + StringRef()}; + } - StringRef Spec = Fmt.slice(1, BC); - StringRef Right = Fmt.substr(BC + 1); + // Even if there is a closing brace, if there is another open brace before + // this closing brace, treat this portion as literal, and try again with the + // next one. + size_t BO2 = Fmt.find_first_of('{', 1); + if (BO2 < BC) + return {ReplacementItem(Fmt.substr(0, BO2)), Fmt.substr(BO2)}; - auto RI = parseReplacementItem(Spec); - if (RI) - return std::make_pair(*RI, Right); + StringRef Spec = Fmt.slice(1, BC); + StringRef Right = Fmt.substr(BC + 1); - // If there was an error parsing the replacement item, treat it as an - // invalid replacement spec, and just continue. - Fmt = Fmt.drop_front(BC + 1); - } - return std::make_pair(ReplacementItem{Fmt}, StringRef()); + return {parseReplacementItem(Spec), Right}; } #ifndef NDEBUG @@ -153,17 +142,18 @@ formatv_object_base::parseFormatString(StringRef Fmt, size_t NumArgs, #if ENABLE_VALIDATION const StringRef SavedFmtStr = Fmt; - size_t NumExpectedArgs = 0; + unsigned NumExpectedArgs = 0; #endif while (!Fmt.empty()) { - ReplacementItem I; + std::optional I; std::tie(I, Fmt) = splitLiteralAndReplacement(Fmt); - if (I.Type != ReplacementType::Empty) - Replacements.push_back(I); + if (!I) + continue; + Replacements.emplace_back(*I); #if ENABLE_VALIDATION - if (I.Type == ReplacementType::Format) - NumExpectedArgs = std::max(NumExpectedArgs, I.Index + 1); + if (I->Type == ReplacementType::Format) + NumExpectedArgs = std::max(NumExpectedArgs, I->Index + 1); #endif } @@ -195,7 +185,7 @@ formatv_object_base::parseFormatString(StringRef Fmt, size_t NumArgs, // Find the number of unique indices seen. All replacement indices // are < NumExpectedArgs. 
SmallVector Indices(NumExpectedArgs); - size_t Count = 0; + unsigned Count = 0; for (const ReplacementItem &I : Replacements) { if (I.Type != ReplacementType::Format || Indices[I.Index]) continue; diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index 7f1e09b1857dd..4f3d1791c0018 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -87,14 +87,14 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ("", Replacements[0].Options); Replacements = parseFormatString("{1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(1u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ("", Replacements[0].Options); @@ -103,7 +103,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ("", Replacements[0].Options); @@ -112,7 +112,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Left, Replacements[0].Where); EXPECT_EQ("", Replacements[0].Options); @@ -121,7 +121,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Center, Replacements[0].Where); EXPECT_EQ("", Replacements[0].Options); @@ -130,7 +130,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ("foo", Replacements[0].Options); @@ -139,7 +139,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Left, Replacements[0].Where); EXPECT_EQ("foo", Replacements[0].Options); @@ -148,7 +148,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Left, Replacements[0].Where); EXPECT_EQ("foo", Replacements[0].Options); @@ -159,7 +159,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0:0:1", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, 
Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ("0:1", Replacements[0].Options); @@ -169,7 +169,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0,p+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(4u, Replacements[0].Align); + EXPECT_EQ(4u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ('p', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); @@ -180,7 +180,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0,-+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(4u, Replacements[0].Align); + EXPECT_EQ(4u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ('-', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); @@ -190,7 +190,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0,+-4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(4u, Replacements[0].Align); + EXPECT_EQ(4u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Left, Replacements[0].Where); EXPECT_EQ('+', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); @@ -200,7 +200,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0,==4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(4u, Replacements[0].Align); + EXPECT_EQ(4u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Center, Replacements[0].Where); EXPECT_EQ('=', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); @@ -210,7 +210,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0,:=4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(4u, Replacements[0].Align); + EXPECT_EQ(4u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Center, Replacements[0].Where); EXPECT_EQ(':', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); @@ -222,7 +222,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ("", Replacements[0].Options); // Including if the colon is present but contains no text. @@ -230,7 +230,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ("", Replacements[0].Options); // 3. 
If alignment is missing, it defaults to 0, right, space @@ -240,7 +240,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ(' ', Replacements[0].Pad); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ("foo", Replacements[0].Options); } @@ -250,7 +250,7 @@ TEST(FormatVariadicTest, MultipleReplacements) { // {0} EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ("", Replacements[0].Options); @@ -261,7 +261,7 @@ TEST(FormatVariadicTest, MultipleReplacements) { // {1:foo} - Options=foo EXPECT_EQ(ReplacementType::Format, Replacements[2].Type); EXPECT_EQ(1u, Replacements[2].Index); - EXPECT_EQ(0u, Replacements[2].Align); + EXPECT_EQ(0u, Replacements[2].Width); EXPECT_EQ(AlignStyle::Right, Replacements[2].Where); EXPECT_EQ("foo", Replacements[2].Options); @@ -272,7 +272,7 @@ TEST(FormatVariadicTest, MultipleReplacements) { // {2:bar,-3} - Options=bar, Align=-3 EXPECT_EQ(ReplacementType::Format, Replacements[4].Type); EXPECT_EQ(2u, Replacements[4].Index); - EXPECT_EQ(3u, Replacements[4].Align); + EXPECT_EQ(3u, Replacements[4].Width); EXPECT_EQ(AlignStyle::Left, Replacements[4].Where); EXPECT_EQ("bar", Replacements[4].Options); } From be1958fd487dd58532a45b40be4a7152b80ec31a Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 5 Sep 2024 17:02:48 +0100 Subject: [PATCH 250/425] [LLVM][CodeGen][SVE] Implement nxvbf16 fpextend to nxvf32/nxvf64. (#107253) NOTE: There are no dedicated SVE instructions but bf16->f32 is just a left shift because they share the same exponent range and from there other convert instructions can be used. --- .../Target/AArch64/AArch64ISelLowering.cpp | 23 ++++- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 7 +- .../test/CodeGen/AArch64/sve-bf16-converts.ll | 89 +++++++++++++++++++ 3 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-bf16-converts.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d1ddbfa300846..c0671dd1f0087 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1663,6 +1663,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); @@ -4298,8 +4299,28 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.isScalableVector()) + if (VT.isScalableVector()) { + SDValue SrcVal = Op.getOperand(0); + + if (SrcVal.getValueType().getScalarType() == MVT::bf16) { + // bf16 and f32 share the same exponent range so the conversion requires + // them to be aligned with the new mantissa bits zero'd. This is just a + // left shift that is best to isel directly. 
+ if (VT == MVT::nxv2f32 || VT == MVT::nxv4f32) + return Op; + + if (VT != MVT::nxv2f64) + return SDValue(); + + // Break other conversions in two with the first part converting to f32 + // and the second using native f32->VT instructions. + SDLoc DL(Op); + return DAG.getNode(ISD::FP_EXTEND, DL, VT, + DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal)); + } + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); + } if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPExtendToSVE(Op, DAG); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 4922fb280333b..692cd66d38437 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2320,7 +2320,12 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 (SVEAllActive:$Pg)), nxv2f32:$Zs, (i64 timm0_1), nxv2f16:$Zd)), (FCVT_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - // Signed integer -> Floating-point + def : Pat<(nxv4f32 (fpextend nxv4bf16:$op)), + (LSL_ZZI_S $op, (i32 16))>; + def : Pat<(nxv2f32 (fpextend nxv2bf16:$op)), + (LSL_ZZI_S $op, (i32 16))>; + + // Signed integer -> Floating-point def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), (sext_inreg nxv2i64:$Zs, nxv2i16), nxv2f16:$Zd)), (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll new file mode 100644 index 0000000000000..d72f92c1dac1f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve < %s | FileCheck %s +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define @fpext_nxv2bf16_to_nxv2f32( %a) { +; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fpext_nxv4bf16_to_nxv4f32( %a) { +; CHECK-LABEL: fpext_nxv4bf16_to_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fpext_nxv8bf16_to_nxv8f32( %a) { +; CHECK-LABEL: fpext_nxv8bf16_to_nxv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: lsl z0.s, z1.s, #16 +; CHECK-NEXT: lsl z1.s, z2.s, #16 +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fpext_nxv2bf16_to_nxv2f64( %a) { +; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.d, p0/m, z0.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fpext_nxv4bf16_to_nxv4f64( %a) { +; CHECK-LABEL: fpext_nxv4bf16_to_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z2.s, z0.s, #16 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvt z0.d, p0/m, z1.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fcvt z1.d, p0/m, z2.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fpext_nxv8bf16_to_nxv8f64( %a) { +; CHECK-LABEL: fpext_nxv8bf16_to_nxv8f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: ptrue p0.d +; 
CHECK-NEXT: uunpklo z2.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z2.s, z2.s, #16 +; CHECK-NEXT: lsl z3.s, z3.s, #16 +; CHECK-NEXT: lsl z4.s, z0.s, #16 +; CHECK-NEXT: fcvt z1.d, p0/m, z1.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvt z0.d, p0/m, z2.s +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: fcvt z2.d, p0/m, z3.s +; CHECK-NEXT: movprfx z3, z4 +; CHECK-NEXT: fcvt z3.d, p0/m, z4.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} From 62e6c1ead7aedfbf973fb667537ff5cee4988da1 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 5 Sep 2024 12:29:46 -0400 Subject: [PATCH 251/425] [lld/mac] Allow -segprot having stricter initprot than maxprot on mac (#107269) ...including for catalyst. The use case for this is to put certain security-critical variables into a special segment/section that's mapped as read-only most of the time, and that temporarily gets remapped as writeable when these variables are written to by the program. This protects against them being written to by heap spraying attacks. This special section should be mapped as read-only at program start, so using `-segprot MY_PROTECTED_MEMORY_THINGER rw r` to mark that segment as rw maxprot and r initprot is exactly what we want. lld has so far rejected mismatching initprot and maxprot. ld64 doesn't reject this, but silently writes initprot into both fields (!) It looks like this might not be fully intentional, see https://crbug.com/41495919#comment5 and http://crbug.com/41495919#comment8. In any case, when postprocessing ld64's output to have different values for initprot and maxprot, the dynamic loader seems to do the right thing (see also the previous two links). The same technique also works on Windows, using both link.exe and lld-link.exe with `/SECTION:myprotsect,R`. So, since this is useful, allow it when targeting macOS, and make it do what you'd expect. Since loader support for this on iOS is less clear, keep disallowing it there for now. See the PR for the program I used to check that this seems to work. (I only checked on arm64 macOS 14.5 so far; will run this on many more systems on bots once this is merged and rolled in.) --- lld/MachO/Driver.cpp | 18 +++++++++++++--- lld/MachO/OutputSegment.cpp | 6 ++++++ lld/test/MachO/segprot.s | 41 +++++++++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 9 deletions(-) diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 6a1ff96ed6569..09a539d71dab3 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -1882,9 +1882,21 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, StringRef segName = arg->getValue(0); uint32_t maxProt = parseProtection(arg->getValue(1)); uint32_t initProt = parseProtection(arg->getValue(2)); - if (maxProt != initProt && config->arch() != AK_i386) - error("invalid argument '" + arg->getAsString(args) + - "': max and init must be the same for non-i386 archs"); + + // FIXME: Check if this works on more platforms. 
+ bool allowsDifferentInitAndMaxProt = + config->platform() == PLATFORM_MACOS || + config->platform() == PLATFORM_MACCATALYST; + if (allowsDifferentInitAndMaxProt) { + if (initProt > maxProt) + error("invalid argument '" + arg->getAsString(args) + + "': init must not be more permissive than max"); + } else { + if (maxProt != initProt && config->arch() != AK_i386) + error("invalid argument '" + arg->getAsString(args) + + "': max and init must be the same for non-macOS non-i386 archs"); + } + if (segName == segment_names::linkEdit) error("-segprot cannot be used to change __LINKEDIT's protections"); config->segmentProtections.push_back({segName, maxProt, initProt}); diff --git a/lld/MachO/OutputSegment.cpp b/lld/MachO/OutputSegment.cpp index 3d8a8eb61a9bb..c320af3fb3177 100644 --- a/lld/MachO/OutputSegment.cpp +++ b/lld/MachO/OutputSegment.cpp @@ -42,6 +42,12 @@ static uint32_t initProt(StringRef name) { static uint32_t maxProt(StringRef name) { assert(config->arch() != AK_i386 && "TODO: i386 has different maxProt requirements"); + auto it = find_if( + config->segmentProtections, + [&](const SegmentProtection &segprot) { return segprot.name == name; }); + if (it != config->segmentProtections.end()) + return it->maxProt; + return initProt(name); } diff --git a/lld/test/MachO/segprot.s b/lld/test/MachO/segprot.s index a4e91d1336105..a5ca8342b41aa 100644 --- a/lld/test/MachO/segprot.s +++ b/lld/test/MachO/segprot.s @@ -2,7 +2,10 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o ## Make sure the option parser doesn't think --x and -w are flags. -# RUN: %lld -dylib -o %t %t.o -segprot FOO rwx xwr -segprot BAR --x --x -segprot BAZ -w -w +# RUN: %lld -dylib -o %t %t.o \ +# RUN: -segprot FOO rwx xwr \ +# RUN: -segprot BAR --x --x \ +# RUN: -segprot BAZ -w -w # RUN: llvm-readobj --macho-segment %t | FileCheck %s # CHECK: Name: FOO @@ -32,12 +35,38 @@ # CHECK-NEXT: maxprot: -w- # CHECK-NEXT: initprot: -w- -# RUN: not %lld -dylib -o /dev/null %t.o -segprot FOO rwx rw 2>&1 | FileCheck %s --check-prefix=MISMATCH -# RUN: not %lld -dylib -o /dev/null %t.o -segprot __LINKEDIT rwx rwx 2>&1 | FileCheck %s --check-prefix=NO-LINKEDIT -# RUN: not %lld -dylib -o /dev/null %t.o -segprot FOO uhh wat 2>&1 | FileCheck %s --check-prefix=MISPARSE -# RUN: not %lld -dylib -o /dev/null %t.o -segprot FOO rwx 2>&1 | FileCheck %s --check-prefix=MISSING +# RUN: %lld -dylib -o %t.different %t.o -segprot FOO rw r +# RUN: llvm-readobj --macho-segment %t.different \ +# RUN: | FileCheck %s --check-prefix=DIFFERENT -# MISMATCH: error: invalid argument '-segprot FOO rwx rw': max and init must be the same for non-i386 archs +# RUN: %no-arg-lld -arch x86_64 -platform_version "mac catalyst" 14.0.0 17.5 \ +# RUN: -dylib -o /dev/null %t.o -segprot FOO rw r +# RUN: llvm-readobj --macho-segment %t.different \ +# RUN: | FileCheck %s --check-prefix=DIFFERENT + +# DIFFERENT: Name: FOO +# DIFFERENT-NEXT: Size: +# DIFFERENT-NEXT: vmaddr: +# DIFFERENT-NEXT: vmsize: +# DIFFERENT-NEXT: fileoff: +# DIFFERENT-NEXT: filesize: +# DIFFERENT-NEXT: maxprot: rw- +# DIFFERENT-NEXT: initprot: r-- + +# RUN: not %no-arg-lld -arch x86_64 -platform_version ios-simulator 14.0 15.0 \ +# RUN: -dylib -o /dev/null %t.o -segprot FOO rwx rw 2>&1 \ +# RUN: | FileCheck %s --check-prefix=MISMATCH +# RUN: not %lld -dylib -o /dev/null %t.o -segprot FOO r rw 2>&1 \ +# RUN: | FileCheck %s --check-prefix=INITTOOPERMISSIVE +# RUN: not %lld -dylib -o /dev/null %t.o -segprot __LINKEDIT rwx rwx 2>&1 \ +# RUN: | FileCheck %s --check-prefix=NO-LINKEDIT +# RUN: 
not %lld -dylib -o /dev/null %t.o -segprot FOO uhh wat 2>&1 \ +# RUN: | FileCheck %s --check-prefix=MISPARSE +# RUN: not %lld -dylib -o /dev/null %t.o -segprot FOO rwx 2>&1 \ +# RUN: | FileCheck %s --check-prefix=MISSING + +# MISMATCH: error: invalid argument '-segprot FOO rwx rw': max and init must be the same for non-macOS non-i386 archs +# INITTOOPERMISSIVE: error: invalid argument '-segprot FOO r rw': init must not be more permissive than max # NO-LINKEDIT: error: -segprot cannot be used to change __LINKEDIT's protections # MISPARSE: error: unknown -segprot letter 'u' in uhh # MISPARSE: error: unknown -segprot letter 'a' in wat From ce3648094d44e8c098396a353b215acecb363cda Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 6 Sep 2024 00:31:01 +0800 Subject: [PATCH 252/425] [RISCV] Update V0Defs after moving Src in peepholes (#107359) If we move a pseudo in tryReduceVL or foldVMV_V_V via ensureDominates, its V0 definition may have changed so we need to update V0Defs. This shouldn't have any functional change today since any pseudo which uses V0 won't be able to move past a new definition. However this will matter if we add a peephole to convert unmasked pseudos to masked pseudos and add a use of V0. --- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index a612a03106f02..db8e496493c41 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -61,7 +61,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { } private: - bool tryToReduceVL(MachineInstr &MI) const; + bool tryToReduceVL(MachineInstr &MI); bool convertToVLMAX(MachineInstr &MI) const; bool convertToWholeRegister(MachineInstr &MI) const; bool convertToUnmasked(MachineInstr &MI) const; @@ -72,7 +72,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool hasSameEEW(const MachineInstr &User, const MachineInstr &Src) const; bool isAllOnesMask(const MachineInstr *MaskDef) const; std::optional getConstant(const MachineOperand &VL) const; - bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const; + bool ensureDominates(const MachineOperand &Use, MachineInstr &Src); /// Maps uses of V0 to the corresponding def of V0. DenseMap V0Defs; @@ -115,7 +115,7 @@ bool RISCVVectorPeephole::hasSameEEW(const MachineInstr &User, // Attempt to reduce the VL of an instruction whose sole use is feeding a // instruction with a narrower VL. This currently works backwards from the // user instruction (which might have a smaller VL). -bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { +bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) { // Note that the goal here is a bit multifaceted. // 1) For store's reducing the VL of the value being stored may help to // reduce VL toggles. This is somewhat of an artifact of the fact we @@ -465,17 +465,18 @@ static bool dominates(MachineBasicBlock::const_iterator A, /// does. Returns false if doesn't dominate and we can't move. \p MO must be in /// the same basic block as \Src. 
bool RISCVVectorPeephole::ensureDominates(const MachineOperand &MO, - MachineInstr &Src) const { + MachineInstr &Src) { assert(MO.getParent()->getParent() == Src.getParent()); if (!MO.isReg() || MO.getReg() == RISCV::NoRegister) return true; MachineInstr *Def = MRI->getVRegDef(MO.getReg()); if (Def->getParent() == Src.getParent() && !dominates(Def, Src)) { - if (!isSafeToMove(Src, *Def->getNextNode())) + MachineInstr *AfterDef = Def->getNextNode(); + if (!isSafeToMove(Src, *AfterDef)) return false; - // FIXME: Update V0Defs - Src.moveBefore(Def->getNextNode()); + V0Defs[&Src] = V0Defs[AfterDef]; + Src.moveBefore(AfterDef); } return true; From 8e28f0471b20ed1148951bc7ffe5c503c43692ae Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Thu, 5 Sep 2024 09:38:05 -0700 Subject: [PATCH 253/425] [libc] Correct the entrypoints list for ARM/darwin (#107331) These entrypoints were added to every target without testing. They don't work on ARM macs. --- libc/config/darwin/arm/entrypoints.txt | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt index 36da9e1313663..a012504daa5c5 100644 --- a/libc/config/darwin/arm/entrypoints.txt +++ b/libc/config/darwin/arm/entrypoints.txt @@ -94,16 +94,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.calloc libc.src.stdlib.realloc libc.src.stdlib.free - - # stdio.h external entrypoints - libc.src.stdio.snprintf - libc.src.stdio.sprintf - libc.src.stdio.asprintf - libc.src.stdio.asprintf - libc.src.stdio.vprintf - libc.src.stdio.vsnprintf - libc.src.stdio.vsprintf - libc.src.stdio.vasprintf ) set(TARGET_LIBM_ENTRYPOINTS @@ -148,9 +138,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.cospif libc.src.math.dfmal libc.src.math.dsqrtl - libc.src.math.daddl libc.src.math.ddivl - libc.src.math.dsubl libc.src.math.erff libc.src.math.exp libc.src.math.expf From 2ed510dc9789ca0b9172f0593527bee9d53496c4 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Thu, 5 Sep 2024 09:38:45 -0700 Subject: [PATCH 254/425] [LLDB][Minidump] Extend the minidump x86_64 registers to include fs_base and gs_base (#106767) A follow up to #106473 Minidump wasn't collecting fs or gs_base. This patch extends the x86_64 register context and gated reading it behind an lldb specific flag. Additionally these registers are explicitly checked in the tests. 
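For illustration, the new registers can be read back from a saved minidump
through the C++ SB API roughly as follows. This is only a sketch: the binary
name and core path are placeholders, error checking is omitted, and the
in-tree Python test updated below remains the authoritative check.

    #include "lldb/API/SBDebugger.h"
    #include "lldb/API/SBFrame.h"
    #include "lldb/API/SBProcess.h"
    #include "lldb/API/SBTarget.h"
    #include "lldb/API/SBThread.h"
    #include "lldb/API/SBValue.h"
    #include <cstdio>

    int main() {
      lldb::SBDebugger::Initialize();
      lldb::SBDebugger dbg = lldb::SBDebugger::Create();
      // Placeholder paths: any x86_64 binary plus a minidump saved from it
      // with this patch applied (e.g. via "process save-core").
      lldb::SBTarget target = dbg.CreateTarget("a.out");
      lldb::SBProcess process = target.LoadCore("core.dmp");
      lldb::SBFrame frame = process.GetThreadAtIndex(0).GetFrameAtIndex(0);
      // fs_base/gs_base are only filled in when the LLDB-specific context
      // flag introduced here was set by the producer of the minidump.
      lldb::SBValue fs_base = frame.FindRegister("fs_base");
      lldb::SBValue gs_base = frame.FindRegister("gs_base");
      std::printf("fs_base = 0x%llx, gs_base = 0x%llx\n",
                  (unsigned long long)fs_base.GetValueAsUnsigned(),
                  (unsigned long long)gs_base.GetValueAsUnsigned());
      lldb::SBDebugger::Terminate();
      return 0;
    }
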
--- .../ObjectFile/Minidump/MinidumpFileBuilder.cpp | 5 ++++- .../minidump/RegisterContextMinidump_x86_64.cpp | 8 ++++++++ .../minidump/RegisterContextMinidump_x86_64.h | 8 +++++++- .../TestProcessSaveCoreMinidump.py | 12 ++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index 13355afb58dbd..5c9ba223ad143 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -473,7 +473,8 @@ GetThreadContext_x86_64(RegisterContext *reg_ctx) { lldb_private::minidump::MinidumpContext_x86_64_Flags::x86_64_Flag | lldb_private::minidump::MinidumpContext_x86_64_Flags::Control | lldb_private::minidump::MinidumpContext_x86_64_Flags::Segments | - lldb_private::minidump::MinidumpContext_x86_64_Flags::Integer); + lldb_private::minidump::MinidumpContext_x86_64_Flags::Integer | + lldb_private::minidump::MinidumpContext_x86_64_Flags::LLDBSpecific); thread_context.rax = read_register_u64(reg_ctx, "rax"); thread_context.rbx = read_register_u64(reg_ctx, "rbx"); thread_context.rcx = read_register_u64(reg_ctx, "rcx"); @@ -499,6 +500,8 @@ GetThreadContext_x86_64(RegisterContext *reg_ctx) { thread_context.gs = read_register_u64(reg_ctx, "gs"); thread_context.ss = read_register_u64(reg_ctx, "ss"); thread_context.ds = read_register_u64(reg_ctx, "ds"); + thread_context.fs_base = read_register_u64(reg_ctx, "fs_base"); + thread_context.gs_base = read_register_u64(reg_ctx, "gs_base"); return thread_context; } diff --git a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp index 917140cab2976..e879c49315659 100644 --- a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp +++ b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp @@ -67,6 +67,7 @@ lldb::DataBufferSP lldb_private::minidump::ConvertMinidumpContext_x86_64( auto ControlFlag = MinidumpContext_x86_64_Flags::Control; auto IntegerFlag = MinidumpContext_x86_64_Flags::Integer; auto SegmentsFlag = MinidumpContext_x86_64_Flags::Segments; + auto LLDBSpecificFlag = MinidumpContext_x86_64_Flags::LLDBSpecific; if ((context_flags & x86_64_Flag) != x86_64_Flag) return nullptr; @@ -104,6 +105,13 @@ lldb::DataBufferSP lldb_private::minidump::ConvertMinidumpContext_x86_64( writeRegister(&context->r15, result_base, reg_info[lldb_r15_x86_64]); } + if ((context_flags & LLDBSpecificFlag) == LLDBSpecificFlag) { + writeRegister(&context->fs_base, result_base, + reg_info[x86_64_with_base::lldb_fs_base]); + writeRegister(&context->gs_base, result_base, + reg_info[x86_64_with_base::lldb_gs_base]); + } + // TODO parse the floating point registers return result_context_buf; diff --git a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.h b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.h index d920ea9d823f4..f8d38e4ba5378 100644 --- a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.h +++ b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.h @@ -153,6 +153,11 @@ struct MinidumpContext_x86_64 { llvm::support::ulittle64_t last_branch_from_rip; llvm::support::ulittle64_t last_exception_to_rip; llvm::support::ulittle64_t last_exception_from_rip; + + // LLDB can save core files and save extra information that isn't available + // from Google breakpad, 
or similar, minidump files. + llvm::support::ulittle64_t fs_base; + llvm::support::ulittle64_t gs_base; }; // For context_flags. These values indicate the type of @@ -168,9 +173,10 @@ enum class MinidumpContext_x86_64_Flags : uint32_t { FloatingPoint = x86_64_Flag | 0x00000008, DebugRegisters = x86_64_Flag | 0x00000010, XState = x86_64_Flag | 0x00000040, + LLDBSpecific = x86_64_Flag | 0x80000000, Full = Control | Integer | FloatingPoint, - All = Full | Segments | DebugRegisters, + All = Full | Segments | DebugRegisters | LLDBSpecific, LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ All) }; diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index ea59aef004aff..ed15793b527fc 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -67,6 +67,18 @@ def verify_core_file( self.assertIn(thread_id, stacks_to_registers_map) register_val_list = stacks_to_registers_map[thread_id] frame_register_list = frame.GetRegisters() + # explicitly verify we collected fs and gs base for x86_64 + explicit_registers = ["fs_base", "gs_base"] + for reg in explicit_registers: + register = frame_register_list.GetFirstValueByName(reg) + self.assertNotEqual(None, register) + self.assertEqual( + register.GetValueAsUnsigned(), + stacks_to_registers_map[thread_id] + .GetFirstValueByName("fs_base") + .GetValueAsUnsigned(), + ) + for x in register_val_list: self.assertEqual( x.GetValueAsUnsigned(), From 953af0e7f1bcb42136be1a0ea9cdd5aa1fb74852 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 5 Sep 2024 12:39:05 -0400 Subject: [PATCH 255/425] [libc++][NFC] Increase consistency for namespace closing comments --- libcxx/include/__mdspan/extents.h | 2 +- libcxx/src/filesystem/error.h | 2 +- libcxx/src/filesystem/file_descriptor.h | 2 +- libcxx/src/filesystem/format_string.h | 2 +- libcxx/src/filesystem/operations.cpp | 6 +++--- libcxx/src/filesystem/posix_compat.h | 2 +- libcxx/src/filesystem/time_utils.h | 2 +- libcxx/src/include/atomic_support.h | 2 +- libcxx/src/memory_resource.cpp | 2 +- libcxx/src/system_error.cpp | 2 +- libcxx/test/benchmarks/ContainerBenchmarks.h | 2 +- libcxx/test/benchmarks/VariantBenchmarks.h | 2 +- .../fs.op.last_write_time/last_write_time.pass.cpp | 2 +- .../meta/meta.unary/meta.unary.prop/is_swappable.pass.cpp | 2 +- .../tuple/tuple.tuple/tuple.apply/apply.pass.cpp | 2 +- libcxx/test/support/archetypes.h | 8 ++++---- libcxx/test/support/container_test_types.h | 4 ++-- libcxx/test/support/filesystem_test_helper.h | 2 +- libcxx/test/support/make_test_thread.h | 2 +- libcxx/test/support/parse_integer.h | 2 +- libcxx/test/support/uses_alloc_types.h | 2 +- libcxxabi/src/cxa_guard_impl.h | 4 ++-- libcxxabi/src/cxa_personality.cpp | 2 +- 23 files changed, 30 insertions(+), 30 deletions(-) diff --git a/libcxx/include/__mdspan/extents.h b/libcxx/include/__mdspan/extents.h index 8fcfb4b09a2b1..3d2c2771a834b 100644 --- a/libcxx/include/__mdspan/extents.h +++ b/libcxx/include/__mdspan/extents.h @@ -449,7 +449,7 @@ struct __make_dextents< _IndexType, 0, extents<_IndexType, _ExtentsPack...>> { using type = extents<_IndexType, _ExtentsPack...>; }; -} // end namespace __mdspan_detail +} // namespace __mdspan_detail // [mdspan.extents.dextents], alias template template diff --git a/libcxx/src/filesystem/error.h 
b/libcxx/src/filesystem/error.h index 572cc73292a19..09020fbede9b9 100644 --- a/libcxx/src/filesystem/error.h +++ b/libcxx/src/filesystem/error.h @@ -225,7 +225,7 @@ struct ErrorHandler { ErrorHandler& operator=(ErrorHandler const&) = delete; }; -} // end namespace detail +} // namespace detail _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/file_descriptor.h b/libcxx/src/filesystem/file_descriptor.h index 50178ff84e03f..f86eb60d77808 100644 --- a/libcxx/src/filesystem/file_descriptor.h +++ b/libcxx/src/filesystem/file_descriptor.h @@ -284,7 +284,7 @@ inline file_status FileDescriptor::refresh_status(error_code& ec) { return m_status; } -} // end namespace detail +} // namespace detail _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/format_string.h b/libcxx/src/filesystem/format_string.h index a44def86f53e9..81c5a95ae31e5 100644 --- a/libcxx/src/filesystem/format_string.h +++ b/libcxx/src/filesystem/format_string.h @@ -70,7 +70,7 @@ inline _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) string format_string(const cha return ret; } -} // end namespace detail +} // namespace detail _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/operations.cpp b/libcxx/src/filesystem/operations.cpp index a83c1ae15a4a4..d771f20097352 100644 --- a/libcxx/src/filesystem/operations.cpp +++ b/libcxx/src/filesystem/operations.cpp @@ -254,7 +254,7 @@ bool copy_file_impl(FileDescriptor& read_fd, FileDescriptor& write_fd, error_cod #endif // copy_file_impl implementation } // end anonymous namespace -} // end namespace detail +} // namespace detail bool __copy_file(const path& from, const path& to, copy_options options, error_code* ec) { using detail::FileDescriptor; @@ -732,7 +732,7 @@ uintmax_t remove_all_impl(path const& p, error_code& ec) { return count; } -} // end namespace +} // namespace uintmax_t __remove_all(const path& p, error_code* ec) { ErrorHandler err("remove_all", ec, &p); @@ -827,7 +827,7 @@ uintmax_t remove_all_impl(int parent_directory, const path& p, error_code& ec) { return 0; } -} // end namespace +} // namespace uintmax_t __remove_all(const path& p, error_code* ec) { ErrorHandler err("remove_all", ec, &p); diff --git a/libcxx/src/filesystem/posix_compat.h b/libcxx/src/filesystem/posix_compat.h index 760cdb65dae1d..b41c004341af3 100644 --- a/libcxx/src/filesystem/posix_compat.h +++ b/libcxx/src/filesystem/posix_compat.h @@ -490,7 +490,7 @@ using SSizeT = ::ssize_t; #endif -} // end namespace detail +} // namespace detail _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/time_utils.h b/libcxx/src/filesystem/time_utils.h index e05f252868f03..13f0d6f3b726e 100644 --- a/libcxx/src/filesystem/time_utils.h +++ b/libcxx/src/filesystem/time_utils.h @@ -344,7 +344,7 @@ inline file_time_type __extract_last_write_time(const path& p, const StatT& st, #endif // !_LIBCPP_HAS_NO_FILESYSTEM -} // end namespace detail +} // namespace detail _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/include/atomic_support.h b/libcxx/src/include/atomic_support.h index 9ce41b3229f5e..c4bc34ffc1cd2 100644 --- a/libcxx/src/include/atomic_support.h +++ b/libcxx/src/include/atomic_support.h @@ -125,7 +125,7 @@ __libcpp_atomic_compare_exchange(_ValueType* __val, _ValueType* __expected, _Val #endif // _LIBCPP_HAS_NO_THREADS -} // end namespace +} // namespace _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/memory_resource.cpp b/libcxx/src/memory_resource.cpp index 2117238e63487..d2ff3509c5a31 100644 --- a/libcxx/src/memory_resource.cpp +++ 
b/libcxx/src/memory_resource.cpp @@ -82,7 +82,7 @@ union ResourceInitHelper { // attribute with a value that's reserved for the implementation (we're the implementation). #include "memory_resource_init_helper.h" -} // end namespace +} // namespace memory_resource* new_delete_resource() noexcept { return &res_init.resources.new_delete_res; } diff --git a/libcxx/src/system_error.cpp b/libcxx/src/system_error.cpp index f518b480a2782..3367bd56bd788 100644 --- a/libcxx/src/system_error.cpp +++ b/libcxx/src/system_error.cpp @@ -110,7 +110,7 @@ string make_error_str(const error_code& ec) { } return string(); } -} // end namespace +} // namespace string __do_message::message(int ev) const { #if defined(_LIBCPP_HAS_NO_THREADS) diff --git a/libcxx/test/benchmarks/ContainerBenchmarks.h b/libcxx/test/benchmarks/ContainerBenchmarks.h index 744505b439985..5404814e7599b 100644 --- a/libcxx/test/benchmarks/ContainerBenchmarks.h +++ b/libcxx/test/benchmarks/ContainerBenchmarks.h @@ -223,6 +223,6 @@ static void BM_Compare_different_containers(benchmark::State& st, Container, Gen } } -} // end namespace ContainerBenchmarks +} // namespace ContainerBenchmarks #endif // BENCHMARK_CONTAINER_BENCHMARKS_H diff --git a/libcxx/test/benchmarks/VariantBenchmarks.h b/libcxx/test/benchmarks/VariantBenchmarks.h index a8e9c9febd728..ad36b59d2e879 100644 --- a/libcxx/test/benchmarks/VariantBenchmarks.h +++ b/libcxx/test/benchmarks/VariantBenchmarks.h @@ -50,6 +50,6 @@ static void BM_Visit(benchmark::State& state) { } } -} // end namespace VariantBenchmarks +} // namespace VariantBenchmarks #endif // BENCHMARK_VARIANT_BENCHMARKS_H diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp index c1816a94fd2b6..7024e50e81209 100644 --- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp @@ -300,7 +300,7 @@ static const bool SupportsMinRoundTrip = [] { return min_val == file_time_type::min(); }(); -} // end namespace +} // namespace static bool CompareTime(TimeSpec t1, TimeSpec t2) { if (SupportsNanosecondRoundTrip) diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable.pass.cpp index 925ddad28136b..cb7614224342f 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable.pass.cpp @@ -61,7 +61,7 @@ struct AmbiguousSwap {}; template void swap(T&, T&) {} -} // end namespace MyNS2 +} // namespace MyNS2 int main(int, char**) { diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp index af37527522075..ebcbcfeffc022 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp @@ -247,7 +247,7 @@ namespace ReturnTypeTest { using InvokeResult = decltype(std::apply(fn, t)); static_assert(std::is_same::value, ""); } -} // end namespace ReturnTypeTest +} // namespace ReturnTypeTest void test_return_type() { diff --git a/libcxx/test/support/archetypes.h b/libcxx/test/support/archetypes.h index 
53545f687970e..259346e90a510 100644 --- a/libcxx/test/support/archetypes.h +++ b/libcxx/test/support/archetypes.h @@ -328,7 +328,7 @@ constexpr bool operator!=(Tp const& L, Tp const& R) noexcept { return L.value != R.value; } -} // end namespace ConstexprTestTypes +} // namespace ConstexprTestTypes //============================================================================// @@ -351,7 +351,7 @@ constexpr bool operator!=(Tp const& L, Tp const& R) noexcept { return L.value != R.value; } -} // end namespace ExplicitConstexprTestTypes +} // namespace ExplicitConstexprTestTypes //============================================================================// @@ -373,7 +373,7 @@ constexpr bool operator!=(Tp const& L, Tp const& R) noexcept { return L.value != R.value; } -} // end namespace TrivialTestTypes +} // namespace TrivialTestTypes //============================================================================// // @@ -395,7 +395,7 @@ constexpr bool operator!=(Tp const& L, Tp const& R) noexcept { return L.value != R.value; } -} // end namespace ExplicitTrivialTestTypes +} // namespace ExplicitTrivialTestTypes #endif // TEST_STD_VER >= 11 diff --git a/libcxx/test/support/container_test_types.h b/libcxx/test/support/container_test_types.h index b72bbbeaccf77..7a97e04398054 100644 --- a/libcxx/test/support/container_test_types.h +++ b/libcxx/test/support/container_test_types.h @@ -349,7 +349,7 @@ typedef std::allocator_traits A2T; static_assert(std::is_same, A2T>::value, ""); static_assert(std::is_same, A1T>::value, ""); -} // end namespace test_detail +} // namespace test_detail //===----------------------------------------------------------------------===// // 'CopyInsertable', 'MoveInsertable' and 'EmplaceConstructible' test types @@ -491,6 +491,6 @@ template > using multiset = std::multiset, ContainerTestAllocator >; -} // end namespace TCT +} // namespace TCT #endif // SUPPORT_CONTAINER_TEST_TYPES_H diff --git a/libcxx/test/support/filesystem_test_helper.h b/libcxx/test/support/filesystem_test_helper.h index 29bc846fbbc83..932d14f45f8e5 100644 --- a/libcxx/test/support/filesystem_test_helper.h +++ b/libcxx/test/support/filesystem_test_helper.h @@ -146,7 +146,7 @@ namespace utils { struct ::stat tmp; return ::stat(path.c_str(), &tmp) == 0; } -} // end namespace utils +} // namespace utils struct scoped_test_env { diff --git a/libcxx/test/support/make_test_thread.h b/libcxx/test/support/make_test_thread.h index cd548fd909d71..00190a8a69ce1 100644 --- a/libcxx/test/support/make_test_thread.h +++ b/libcxx/test/support/make_test_thread.h @@ -46,6 +46,6 @@ TEST_AVAILABILITY_SYNC std::jthread make_test_jthread(F&& f, Args&&... 
args) { } #endif -} // end namespace support +} // namespace support #endif // TEST_SUPPORT_MAKE_TEST_THREAD_H diff --git a/libcxx/test/support/parse_integer.h b/libcxx/test/support/parse_integer.h index c4fa429e65d57..36f8f7b79881b 100644 --- a/libcxx/test/support/parse_integer.h +++ b/libcxx/test/support/parse_integer.h @@ -62,7 +62,7 @@ struct parse_integer_impl { return std::stoull(str); } }; -} // end namespace detail +} // namespace detail template T parse_integer(std::basic_string const& str) { diff --git a/libcxx/test/support/uses_alloc_types.h b/libcxx/test/support/uses_alloc_types.h index 817bef1da3aef..66746960fc87c 100644 --- a/libcxx/test/support/uses_alloc_types.h +++ b/libcxx/test/support/uses_alloc_types.h @@ -120,7 +120,7 @@ using IdentityT = typename Identity::type; template using EnableIfB = typename std::enable_if::type; -} // end namespace detail +} // namespace detail using detail::EnableIfB; diff --git a/libcxxabi/src/cxa_guard_impl.h b/libcxxabi/src/cxa_guard_impl.h index 320501cb85938..3e533054098e2 100644 --- a/libcxxabi/src/cxa_guard_impl.h +++ b/libcxxabi/src/cxa_guard_impl.h @@ -676,8 +676,8 @@ static_assert(CurrentImplementation != Implementation::Futex || PlatformSupports using SelectedImplementation = SelectImplementation::type; -} // end namespace -} // end namespace __cxxabiv1 +} // namespace +} // namespace __cxxabiv1 #if defined(__clang__) # pragma clang diagnostic pop diff --git a/libcxxabi/src/cxa_personality.cpp b/libcxxabi/src/cxa_personality.cpp index 843a18a4cbd8a..5f6e75c5be19c 100644 --- a/libcxxabi/src/cxa_personality.cpp +++ b/libcxxabi/src/cxa_personality.cpp @@ -167,7 +167,7 @@ uintptr_t readPointerHelper(const uint8_t*& p) { return static_cast(value); } -} // end namespace +} // namespace extern "C" { From eee2f02e4e28e54e5a38a1dbbd62ea6780909e16 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 5 Sep 2024 12:42:55 -0400 Subject: [PATCH 256/425] [libc++][modules] Get rid of dependency in __datasizeof (#107394) All the compilers we support also provide __builtin_offsetof, so avoid using this macro and use the builtin directly instead. This allows removing a dependency on ``, which is heavier than we need. --- libcxx/include/__type_traits/datasizeof.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__type_traits/datasizeof.h b/libcxx/include/__type_traits/datasizeof.h index b4cbd1ddfa8de..0c1ed94f84029 100644 --- a/libcxx/include/__type_traits/datasizeof.h +++ b/libcxx/include/__type_traits/datasizeof.h @@ -10,9 +10,9 @@ #define _LIBCPP___TYPE_TRAITS_DATASIZEOF_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/is_class.h> #include <__type_traits/is_final.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -37,15 +37,15 @@ struct _FirstPaddingByte { char __first_padding_byte_; }; -// _FirstPaddingByte<> is sometimes non-standard layout. Using `offsetof` is UB in that case, but GCC and Clang allow -// the use as an extension. +// _FirstPaddingByte<> is sometimes non-standard layout. +// It is conditionally-supported to use __builtin_offsetof in that case, but GCC and Clang allow it. 
_LIBCPP_DIAGNOSTIC_PUSH _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-offsetof") _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Winvalid-offsetof") template -inline const size_t __datasizeof_v = offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_); +inline const size_t __datasizeof_v = __builtin_offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_); _LIBCPP_DIAGNOSTIC_POP -#endif // __has_extension(datasizeof) +#endif // __has_extension(datasizeof) _LIBCPP_END_NAMESPACE_STD From 5edede2db09d38cbf9397edb9bfd43b92265f660 Mon Sep 17 00:00:00 2001 From: Tim Gymnich Date: Thu, 5 Sep 2024 18:46:14 +0200 Subject: [PATCH 257/425] [DXIL] Add sign intrinsic part 2 (#101988) makes progress on #70078 ### Changes - Added `int_dx_sign` intrinsic in `IntrinsicsDirectX.td` - Added expansion for `int_dx_sign in `DXILIntrinsicExpansion.cpp` - Added DXIL backend test case ### Related PRs - https://github.com/llvm/llvm-project/pull/101987 - https://github.com/llvm/llvm-project/pull/101989 --- llvm/include/llvm/IR/IntrinsicsDirectX.td | 1 + .../Target/DirectX/DXILIntrinsicExpansion.cpp | 31 +++ llvm/test/CodeGen/DirectX/sign.ll | 212 ++++++++++++++++++ 3 files changed, 244 insertions(+) create mode 100644 llvm/test/CodeGen/DirectX/sign.ll diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index d2c0859f52a24..f089d51fa1b45 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -81,4 +81,5 @@ def int_dx_rcp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_dx_rsqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; +def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty]>; } diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 2daa4f825c3b2..72fa9891bfd8e 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" @@ -48,6 +49,7 @@ static bool isIntrinsicExpansion(Function &F) { case Intrinsic::dx_fdot: case Intrinsic::dx_sdot: case Intrinsic::dx_udot: + case Intrinsic::dx_sign: return true; } return false; @@ -359,6 +361,32 @@ static Value *expandClampIntrinsic(CallInst *Orig, {MaxCall, Max}, nullptr, "dx.min"); } +static Value *expandSignIntrinsic(CallInst *Orig) { + Value *X = Orig->getOperand(0); + Type *Ty = X->getType(); + Type *ScalarTy = Ty->getScalarType(); + Type *RetTy = Orig->getType(); + Constant *Zero = Constant::getNullValue(Ty); + + IRBuilder<> Builder(Orig); + + Value *GT; + Value *LT; + if (ScalarTy->isFloatingPointTy()) { + GT = Builder.CreateFCmpOLT(Zero, X); + LT = Builder.CreateFCmpOLT(X, Zero); + } else { + assert(ScalarTy->isIntegerTy()); + GT = Builder.CreateICmpSLT(Zero, X); + LT = Builder.CreateICmpSLT(X, Zero); + } + + Value *ZextGT = Builder.CreateZExt(GT, RetTy); + Value *ZextLT = Builder.CreateZExt(LT, RetTy); + + return Builder.CreateSub(ZextGT, ZextLT); +} + static bool expandIntrinsic(Function &F, CallInst *Orig) { Value *Result = nullptr; Intrinsic::ID IntrinsicId = F.getIntrinsicID(); @@ -402,6 +430,9 @@ static bool expandIntrinsic(Function &F, 
CallInst *Orig) { case Intrinsic::dx_udot: Result = expandIntegerDotIntrinsic(Orig, IntrinsicId); break; + case Intrinsic::dx_sign: + Result = expandSignIntrinsic(Orig); + break; } if (Result) { diff --git a/llvm/test/CodeGen/DirectX/sign.ll b/llvm/test/CodeGen/DirectX/sign.ll new file mode 100644 index 0000000000000..0a1631bd5ddc1 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/sign.ll @@ -0,0 +1,212 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + + +define noundef i32 @sign_half(half noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_half( +; CHECK-SAME: half noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt half 0xH0000, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt half [[A]], 0xH0000 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.f16(half %a) + ret i32 %elt.sign +} + +define noundef i32 @sign_float(float noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_float( +; CHECK-SAME: float noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt float 0.000000e+00, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt float [[A]], 0.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.f32(float %a) + ret i32 %elt.sign +} + +define noundef i32 @sign_double(double noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_double( +; CHECK-SAME: double noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt double 0.000000e+00, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt double [[A]], 0.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.f64(double %a) + ret i32 %elt.sign +} + +define noundef i32 @sign_i16(i16 noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_i16( +; CHECK-SAME: i16 noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i16 0, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i16 [[A]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.i16(i16 %a) + ret i32 %elt.sign +} + +define noundef i32 @sign_i32(i32 noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_i32( +; CHECK-SAME: i32 noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i32 0, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[A]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.i32(i32 %a) + ret i32 %elt.sign +} + +define noundef i32 @sign_i64(i64 noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_i64( +; CHECK-SAME: i64 
noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i64 0, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i64 [[A]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.i64(i64 %a) + ret i32 %elt.sign +} + +define noundef <4 x i32> @sign_half_vector(<4 x half> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_half_vector( +; CHECK-SAME: <4 x half> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x half> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <4 x half> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4f16(<4 x half> %a) + ret <4 x i32> %elt.sign +} + +define noundef <4 x i32> @sign_float_vector(<4 x float> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_float_vector( +; CHECK-SAME: <4 x float> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <4 x float> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4f32(<4 x float> %a) + ret <4 x i32> %elt.sign +} + +define noundef <4 x i32> @sign_double_vector(<4 x double> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_double_vector( +; CHECK-SAME: <4 x double> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x double> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <4 x double> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4f64(<4 x double> %a) + ret <4 x i32> %elt.sign +} + +define noundef <4 x i32> @sign_i16_vector(<4 x i16> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_i16_vector( +; CHECK-SAME: <4 x i16> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i16> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i16> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4i16(<4 x i16> %a) + ret <4 x i32> %elt.sign +} + +define noundef <4 x i32> @sign_i32_vector(<4 x i32> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_i32_vector( +; CHECK-SAME: <4 x i32> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[A]], zeroinitializer 
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4i32(<4 x i32> %a) + ret <4 x i32> %elt.sign +} + +define noundef <4 x i32> @sign_i64_vector(<4 x i64> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_i64_vector( +; CHECK-SAME: <4 x i64> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i64> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i64> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4i64(<4 x i64> %a) + ret <4 x i32> %elt.sign +} + + +declare i32 @llvm.dx.sign.f16(half) +declare i32 @llvm.dx.sign.f32(float) +declare i32 @llvm.dx.sign.f64(double) + +declare i32 @llvm.dx.sign.i16(i16) +declare i32 @llvm.dx.sign.i32(i32) +declare i32 @llvm.dx.sign.i64(i64) + +declare <4 x i32> @llvm.dx.sign.v4f16(<4 x half>) +declare <4 x i32> @llvm.dx.sign.v4f32(<4 x float>) +declare <4 x i32> @llvm.dx.sign.v4f64(<4 x double>) + +declare <4 x i32> @llvm.dx.sign.v4i16(<4 x i16>) +declare <4 x i32> @llvm.dx.sign.v4i32(<4 x i32>) +declare <4 x i32> @llvm.dx.sign.v4i64(<4 x i64>) From 0818c2801ecc5cb07b680bb77e24df90f35c74b9 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 5 Sep 2024 09:47:26 -0700 Subject: [PATCH 258/425] [WebAssembly] Simplify a switch-case in CFGStackify (NFC) (#107360) This merges some `case`s using `[[fallthrough]]`, and make `DELEGATE` as a separate `case`. (Previously the reason we didn't do that was not to duplicate the code in `RewriteOperands`. But now that we've extracted it into a lambda function in #107182 we can do it. --- .../WebAssembly/WebAssemblyCFGStackify.cpp | 20 +++++++++---------- .../WebAssembly/WebAssemblyInstrControl.td | 4 ++-- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 3362ea5316e45..3cccc57e629fd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -1681,18 +1681,14 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { Stack.pop_back(); break; - case WebAssembly::END_BLOCK: - Stack.push_back(std::make_pair(&MBB, &MI)); - break; - case WebAssembly::END_TRY: { - // We handle DELEGATE in the default level, because DELEGATE has - // immediate operands to rewrite. 
- Stack.push_back(std::make_pair(&MBB, &MI)); auto *EHPad = TryToEHPad[EndToBegin[&MI]]; EHPadStack.push_back(EHPad); - break; + [[fallthrough]]; } + case WebAssembly::END_BLOCK: + Stack.push_back(std::make_pair(&MBB, &MI)); + break; case WebAssembly::END_LOOP: Stack.push_back(std::make_pair(EndToBegin[&MI]->getParent(), &MI)); @@ -1707,12 +1703,14 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { MI.getOperand(0).setImm(getRethrowDepth(Stack, EHPadStack)); break; + case WebAssembly::DELEGATE: + RewriteOperands(MI); + Stack.push_back(std::make_pair(&MBB, &MI)); + break; + default: if (MI.isTerminator()) RewriteOperands(MI); - - if (MI.getOpcode() == WebAssembly::DELEGATE) - Stack.push_back(std::make_pair(&MBB, &MI)); break; } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td index dd40015577fd7..05880b89d1fbc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -147,9 +147,9 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, // usage gets low enough. // Rethrowing an exception: rethrow -let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { +let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in defm RETHROW : NRI<(outs), (ins i32imm:$depth), [], "rethrow \t$depth", 0x09>; -} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 +// isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 // The depth argument will be computed in CFGStackify. We set it to 0 here for // now. def : Pat<(int_wasm_rethrow), (RETHROW 0)>; From 18ad98e7947502da0c8f6dcbbf485bb34fe8d204 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Thu, 5 Sep 2024 09:53:49 -0700 Subject: [PATCH 259/425] [lldb] Fix a format string in ClangASTSource (#107325) Without this, LLDB asserts when enabling the expression logs. --- lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp index 1fdd272dcbece..e41efdd3f61c7 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp @@ -293,7 +293,7 @@ void ClangASTSource::CompleteType(clang::ObjCInterfaceDecl *interface_decl) { LLDB_LOG(log, " [CompleteObjCInterfaceDecl] on (ASTContext*){0:x} '{1}' " - "Completing an ObjCInterfaceDecl named {1}", + "Completing an ObjCInterfaceDecl named {2}", m_ast_context, m_clang_ast_context->getDisplayName(), interface_decl->getName()); LLDB_LOG(log, " [COID] Before:\n{0}", From 91a3c6f3d66b866bcda8a0f7d4815bc8f2dbd86c Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 5 Sep 2024 17:54:57 +0100 Subject: [PATCH 260/425] [AArch64] Remove redundant COPY from loadRegFromStackSlot (#107396) This removes a redundant 'COPY' instruction that #81716 probably forgot to remove. This redundant COPY led to an issue because because code in LiveRangeSplitting expects that the instruction emitted by `loadRegFromStackSlot` is an instruction that accesses memory, which isn't the case for the COPY instruction. 
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 4 --- llvm/test/CodeGen/AArch64/spillfill-sve.mir | 37 +++++++++++++++++++- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 70f69f34a4a7d..3b38a5f78dee5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -5559,10 +5559,6 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (PNRReg.isValid() && !PNRReg.isVirtual()) MI.addDef(PNRReg, RegState::Implicit); MI.addMemOperand(MMO); - - if (PNRReg.isValid() && PNRReg.isVirtual()) - BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), PNRReg) - .addReg(DestReg); } bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir index 11cf388e38531..83c9b73c57570 100644 --- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir +++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir @@ -11,6 +11,7 @@ define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr2mul2() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_pnr() #1 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_pnr() #1 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_ppr_to_pnr() #1 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #0 { entry: unreachable } @@ -216,7 +217,7 @@ body: | ; EXPAND: STR_PXI killed renamable $pn8, $sp, 7 ; ; EXPAND: renamable $pn8 = LDR_PXI $sp, 7 - ; EXPAND: $p0 = PEXT_PCI_B killed renamable $pn8, 0 + ; EXPAND-NEXT: $p0 = PEXT_PCI_B killed renamable $pn8, 0 %0:pnr_p8to15 = WHILEGE_CXX_B undef $x0, undef $x0, 0, implicit-def dead $nzcv @@ -242,6 +243,40 @@ body: | RET_ReallyLR ... --- +name: spills_fills_stack_id_virtreg_ppr_to_pnr +tracksRegLiveness: true +registers: + - { id: 0, class: ppr } + - { id: 1, class: pnr_p8to15 } +stack: +body: | + bb.0.entry: + liveins: $p0 + + %0:ppr = COPY $p0 + + $pn0 = IMPLICIT_DEF + $pn1 = IMPLICIT_DEF + $pn2 = IMPLICIT_DEF + $pn3 = IMPLICIT_DEF + $pn4 = IMPLICIT_DEF + $pn5 = IMPLICIT_DEF + $pn6 = IMPLICIT_DEF + $pn7 = IMPLICIT_DEF + $pn8 = IMPLICIT_DEF + $pn9 = IMPLICIT_DEF + $pn10 = IMPLICIT_DEF + $pn11 = IMPLICIT_DEF + $pn12 = IMPLICIT_DEF + $pn13 = IMPLICIT_DEF + $pn14 = IMPLICIT_DEF + $pn15 = IMPLICIT_DEF + + %1:pnr_p8to15 = COPY %0 + $p0 = PEXT_PCI_B %1, 0 + RET_ReallyLR +... +--- name: spills_fills_stack_id_zpr tracksRegLiveness: true registers: From 54194e1506bdd6dc37988678a8047ad4d48168fa Mon Sep 17 00:00:00 2001 From: Michal Terepeta Date: Thu, 5 Sep 2024 19:01:29 +0200 Subject: [PATCH 261/425] [RISCV][SiFive7] Change `Latency` of VCIX to the default (#106497) Currently we multiply the default (`SiFive7GetCyclesDefault`) by 10, but this turns out to be both surprising to our users and leads to worse codegen in most cases. I think it's more natural to just keep the default. In the end the right solution is probably to have a separate scheduling model for a particular VCIX coprocessor. 
--- llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 3f2e8dee76fd6..24cbe1531c017 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -961,15 +961,18 @@ def : InstRW<[WriteIALU], (instrs COPY)>; // VCIX // -// In principle we don't know the latency of any VCIX instructions. But instead -// of taking the default of 1, which can lead to issues [1], we assume that they -// have a fairly high latency. +// In principle we don't know the latency of any VCIX instructions (they +// depends on a particular coprocessor implementation). However, the default +// latency of 1 can lead to issues [1]. So instead we set the latency to the +// default provided by `SiFive7GetCyclesDefault`. This is still not accurate +// and can lead to suboptimal codegen, but should hopefully be a better +// starting point. // // [1] https://github.com/llvm/llvm-project/issues/83391 foreach mx = SchedMxList in { defvar Cycles = SiFive7GetCyclesDefault.c; defvar IsWorstCase = SiFive7IsWorstCaseMX.c; - let Latency = !mul(Cycles, 10), + let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm "" : LMULWriteResMX<"WriteVC_V_I", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; From cf2ecc7c1c24dee6e3b70a836474a5ac553829b3 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 5 Sep 2024 18:20:16 +0100 Subject: [PATCH 262/425] [LV] Remove over-aggressive assert from 3fe6a064f15c. There are some cases where only the first operand is marked for truncation. In that case, the compare won't be truncated which would incorrectly trigger the assertion. It also shows that the check pre 3fe6a064f15c also considered compares truncated that cannot be truncated. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 7 +- .../truncate-to-minimal-bitwidth-cost.ll | 89 +++++++++++++++++++ 2 files changed, 91 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0ccf442dac999..780da36189c36 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6607,12 +6607,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); - Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); - (void)Op0AsInstruction; - assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) || - canTruncateToMinimalBitwidth(I, VF)) && - "truncating Op0 must imply truncating the compare"); if (canTruncateToMinimalBitwidth(I, VF)) { + Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); + (void)Op0AsInstruction; assert(!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) || MinBWs[I] == MinBWs[Op0AsInstruction] && "if both the operand and the compare are marked for " diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 9fe5a2a6a3ecc..a2c3f2bfa274c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -269,6 +269,93 @@ exit: ret i8 %trunc } +define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 %v, ptr noalias %src) #1 { +; CHECK-LABEL: define void @icmp_only_first_op_truncated( +; CHECK-SAME: ptr noalias [[DST:%.*]], i32 [[X:%.*]], i64 [[N:%.*]], i64 [[V:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[T:%.*]] = trunc i64 [[N]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[V]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[T]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, ptr [[DST]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = trunc [[BROADCAST_SPLAT]] to +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq [[TMP7]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: 
[[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP9]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, ptr [[TMP10]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[BROADCAST_SPLAT4]], i32 8, [[TMP8]], poison) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT6]], i32 8, [[TMP8]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[T1:%.*]] = trunc i64 [[N]] to i32 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[T1]], [[T]] +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr double, ptr [[SRC]], i64 [[IDXPROM]] +; CHECK-NEXT: [[RETVAL:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: store double [[RETVAL]], ptr [[DST]], align 8 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[V]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %t = trunc i64 %N to i32 + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %t1 = trunc i64 %N to i32 + %c = icmp eq i32 %t1, %t + br i1 %c, label %then, label %loop.latch + +then: + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr double, ptr %src, i64 %idxprom + %retval = load double, ptr %arrayidx, align 8 + store double %retval, ptr %dst, align 8 + br label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %v + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } attributes #1 = { "target-features"="+64bit,+v" } @@ -283,4 +370,6 @@ attributes #1 = { "target-features"="+64bit,+v" } ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ;. From 311ac6381649fa0f7cc495db8fa697d6a9b43988 Mon Sep 17 00:00:00 2001 From: Abhina Sree Date: Thu, 5 Sep 2024 13:25:06 -0400 Subject: [PATCH 263/425] [NFC][SystemZ][z/OS] Rename autoconversion-related functions to be less generic (#107399) This patch renames the functions in AutoConvert.h/cpp to have a less generic name because they are z/OS specific. 
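For reference, a minimal usage sketch with the renamed C++ entry points
(assumes a z/OS host; on other hosts the guarded block is compiled out, and
the error handling here is illustrative only):

    #include "llvm/Support/AutoConvert.h"
    #include <cstdio>
    #include <system_error>
    #include <unistd.h>

    int main() {
    #ifdef __MVS__
      // Enable z/OS enhanced ASCII auto-conversion on stdout, then restore
      // the original mode before exiting.
      if (std::error_code EC = llvm::enablezOSAutoConversion(STDOUT_FILENO))
        std::fprintf(stderr, "enabling auto-conversion failed: %s\n",
                     EC.message().c_str());
      std::puts("converted output");
      llvm::restorezOSStdHandleAutoConversion(STDOUT_FILENO);
    #endif
      return 0;
    }
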
--- clang/tools/c-arcmt-test/c-arcmt-test.c | 4 ++-- clang/tools/c-index-test/c-index-test.c | 4 ++-- llvm/include/llvm/Support/AutoConvert.h | 14 +++++++------- llvm/lib/Support/AutoConvert.cpp | 20 ++++++++++---------- llvm/lib/Support/InitLLVM.cpp | 10 +++++----- llvm/lib/Support/MemoryBuffer.cpp | 2 +- llvm/lib/Support/Unix/Path.inc | 8 ++++---- llvm/lib/Support/Unix/Program.inc | 2 +- llvm/lib/Support/raw_ostream.cpp | 4 ++-- llvm/utils/count/count.c | 4 ++-- 10 files changed, 36 insertions(+), 36 deletions(-) diff --git a/clang/tools/c-arcmt-test/c-arcmt-test.c b/clang/tools/c-arcmt-test/c-arcmt-test.c index 00999f188c7dc..4d0c418714b95 100644 --- a/clang/tools/c-arcmt-test/c-arcmt-test.c +++ b/clang/tools/c-arcmt-test/c-arcmt-test.c @@ -109,10 +109,10 @@ static void flush_atexit(void) { int main(int argc, const char **argv) { #ifdef __MVS__ - if (enableAutoConversion(fileno(stdout)) == -1) + if (enablezOSAutoConversion(fileno(stdout)) == -1) fprintf(stderr, "Setting conversion on stdout failed\n"); - if (enableAutoConversion(fileno(stderr)) == -1) + if (enablezOSAutoConversion(fileno(stderr)) == -1) fprintf(stderr, "Setting conversion on stderr failed\n"); #endif diff --git a/clang/tools/c-index-test/c-index-test.c b/clang/tools/c-index-test/c-index-test.c index f472a67f3bc5b..b48f44950ab75 100644 --- a/clang/tools/c-index-test/c-index-test.c +++ b/clang/tools/c-index-test/c-index-test.c @@ -5180,10 +5180,10 @@ int main(int argc, const char **argv) { thread_info client_data; #ifdef __MVS__ - if (enableAutoConversion(fileno(stdout)) == -1) + if (enablezOSAutoConversion(fileno(stdout)) == -1) fprintf(stderr, "Setting conversion on stdout failed\n"); - if (enableAutoConversion(fileno(stderr)) == -1) + if (enablezOSAutoConversion(fileno(stderr)) == -1) fprintf(stderr, "Setting conversion on stderr failed\n"); #endif diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index f4fe80b22a876..6f45c4683f777 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -27,9 +27,9 @@ #ifdef __cplusplus extern "C" { #endif // __cplusplus -int enableAutoConversion(int FD); -int disableAutoConversion(int FD); -int restoreStdHandleAutoConversion(int FD); +int enablezOSAutoConversion(int FD); +int disablezOSAutoConversion(int FD); +int restorezOSStdHandleAutoConversion(int FD); #ifdef __cplusplus } #endif // __cplusplus @@ -39,18 +39,18 @@ namespace llvm { /// \brief Disable the z/OS enhanced ASCII auto-conversion for the file /// descriptor. -std::error_code disableAutoConversion(int FD); +std::error_code disablezOSAutoConversion(int FD); /// \brief Query the z/OS enhanced ASCII auto-conversion status of a file /// descriptor and force the conversion if the file is not tagged with a /// codepage. -std::error_code enableAutoConversion(int FD); +std::error_code enablezOSAutoConversion(int FD); /// Restore the z/OS enhanced ASCII auto-conversion for the std handle. -std::error_code restoreStdHandleAutoConversion(int FD); +std::error_code restorezOSStdHandleAutoConversion(int FD); /// \brief Set the tag information for a file descriptor. 
-std::error_code setFileTag(int FD, int CCSID, bool Text); +std::error_code setzOSFileTag(int FD, int CCSID, bool Text); } // namespace llvm #endif // __cplusplus diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index c509284ee916e..66570735f8fc8 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -22,7 +22,7 @@ static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1}; -int disableAutoConversion(int FD) { +int disablezOSAutoConversion(int FD) { static const struct f_cnvrt Convert = { SETCVTOFF, // cvtcmd 0, // pccsid @@ -32,7 +32,7 @@ int disableAutoConversion(int FD) { return fcntl(FD, F_CONTROL_CVT, &Convert); } -int restoreStdHandleAutoConversion(int FD) { +int restorezOSStdHandleAutoConversion(int FD) { assert(FD == STDIN_FILENO || FD == STDOUT_FILENO || FD == STDERR_FILENO); if (savedStdHandleAutoConversionMode[FD] == -1) return 0; @@ -44,7 +44,7 @@ int restoreStdHandleAutoConversion(int FD) { return (fcntl(FD, F_CONTROL_CVT, &Cvt)); } -int enableAutoConversion(int FD) { +int enablezOSAutoConversion(int FD) { struct f_cnvrt Query = { QUERYCVT, // cvtcmd 0, // pccsid @@ -81,28 +81,28 @@ int enableAutoConversion(int FD) { return fcntl(FD, F_CONTROL_CVT, &Query); } -std::error_code llvm::disableAutoConversion(int FD) { - if (::disableAutoConversion(FD) == -1) +std::error_code llvm::disablezOSAutoConversion(int FD) { + if (::disablezOSAutoConversion(FD) == -1) return errnoAsErrorCode(); return std::error_code(); } -std::error_code llvm::enableAutoConversion(int FD) { - if (::enableAutoConversion(FD) == -1) +std::error_code llvm::enablezOSAutoConversion(int FD) { + if (::enablezOSAutoConversion(FD) == -1) return errnoAsErrorCode(); return std::error_code(); } -std::error_code llvm::restoreStdHandleAutoConversion(int FD) { - if (::restoreStdHandleAutoConversion(FD) == -1) +std::error_code llvm::restorezOSStdHandleAutoConversion(int FD) { + if (::restorezOSStdHandleAutoConversion(FD) == -1) return errnoAsErrorCode(); return std::error_code(); } -std::error_code llvm::setFileTag(int FD, int CCSID, bool Text) { +std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) { assert((!Text || (CCSID != FT_UNTAGGED && CCSID != FT_BINARY)) && "FT_UNTAGGED and FT_BINARY are not allowed for text files"); struct file_tag Tag; diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp index b7e463a19122d..2b3ddd39b0879 100644 --- a/llvm/lib/Support/InitLLVM.cpp +++ b/llvm/lib/Support/InitLLVM.cpp @@ -27,9 +27,9 @@ void CleanupStdHandles(void *Cookie) { llvm::raw_ostream *Outs = &llvm::outs(), *Errs = &llvm::errs(); Outs->flush(); Errs->flush(); - llvm::restoreStdHandleAutoConversion(STDIN_FILENO); - llvm::restoreStdHandleAutoConversion(STDOUT_FILENO); - llvm::restoreStdHandleAutoConversion(STDERR_FILENO); + llvm::restorezOSStdHandleAutoConversion(STDIN_FILENO); + llvm::restorezOSStdHandleAutoConversion(STDOUT_FILENO); + llvm::restorezOSStdHandleAutoConversion(STDERR_FILENO); } #endif @@ -70,8 +70,8 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, // If turning on conversion for stderr fails then the error message // may be garbled. There is no solution to this problem. 
- ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDERR_FILENO))); - ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDOUT_FILENO))); + ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDERR_FILENO))); + ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDOUT_FILENO))); #endif #ifdef _WIN32 diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index fb7e804fd7e84..aea81964ba9fd 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -505,7 +505,7 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, #ifdef __MVS__ // Set codepage auto-conversion for z/OS. - if (auto EC = llvm::enableAutoConversion(FD)) + if (auto EC = llvm::enablezOSAutoConversion(FD)) return EC; #endif diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index cf05db546e021..8098392a7fd90 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -1096,17 +1096,17 @@ std::error_code openFile(const Twine &Name, int &ResultFD, !Stat.st_tag.ft_txtflag && !Stat.st_tag.ft_ccsid && Stat.st_size == 0; if (Flags & OF_Text) { - if (auto EC = llvm::enableAutoConversion(ResultFD)) + if (auto EC = llvm::enablezOSAutoConversion(ResultFD)) return EC; if (DoSetTag) { - if (auto EC = llvm::setFileTag(ResultFD, CCSID_IBM_1047, true)) + if (auto EC = llvm::setzOSFileTag(ResultFD, CCSID_IBM_1047, true)) return EC; } } else { - if (auto EC = llvm::disableAutoConversion(ResultFD)) + if (auto EC = llvm::disablezOSAutoConversion(ResultFD)) return EC; if (DoSetTag) { - if (auto EC = llvm::setFileTag(ResultFD, FT_BINARY, false)) + if (auto EC = llvm::setzOSFileTag(ResultFD, FT_BINARY, false)) return EC; } } diff --git a/llvm/lib/Support/Unix/Program.inc b/llvm/lib/Support/Unix/Program.inc index 2742734bb11ed..ec0fad7076b45 100644 --- a/llvm/lib/Support/Unix/Program.inc +++ b/llvm/lib/Support/Unix/Program.inc @@ -533,7 +533,7 @@ std::error_code llvm::sys::ChangeStdoutMode(fs::OpenFlags Flags) { std::error_code llvm::sys::ChangeStdinToBinary() { #ifdef __MVS__ - return disableAutoConversion(STDIN_FILENO); + return disablezOSAutoConversion(STDIN_FILENO); #else // Do nothing, as Unix doesn't differentiate between text and binary. return std::error_code(); diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 2ce54faa9857e..48f04091e9e39 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -894,7 +894,7 @@ raw_fd_ostream &llvm::outs() { // Set buffer settings to model stdout behavior. std::error_code EC; #ifdef __MVS__ - EC = enableAutoConversion(STDOUT_FILENO); + EC = enablezOSAutoConversion(STDOUT_FILENO); assert(!EC); #endif static raw_fd_ostream S("-", EC, sys::fs::OF_None); @@ -905,7 +905,7 @@ raw_fd_ostream &llvm::outs() { raw_fd_ostream &llvm::errs() { // Set standard error to be unbuffered. 
#ifdef __MVS__ - std::error_code EC = enableAutoConversion(STDERR_FILENO); + std::error_code EC = enablezOSAutoConversion(STDERR_FILENO); assert(!EC); #endif static raw_fd_ostream S(STDERR_FILENO, false, true); diff --git a/llvm/utils/count/count.c b/llvm/utils/count/count.c index 300be2aa8a18e..9166145fcc10a 100644 --- a/llvm/utils/count/count.c +++ b/llvm/utils/count/count.c @@ -12,10 +12,10 @@ int main(int argc, char **argv) { #ifdef __MVS__ - if (enableAutoConversion(fileno(stdin)) == -1) + if (enablezOSAutoConversion(fileno(stdin)) == -1) fprintf(stderr, "Setting conversion on stdin failed\n"); - if (enableAutoConversion(fileno(stderr)) == -1) + if (enablezOSAutoConversion(fileno(stderr)) == -1) fprintf(stdout, "Setting conversion on stderr failed\n"); #endif size_t Count, NumLines, NumRead; From 5e25291b3c50873dbd0e2b3939b113bcff691460 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 5 Sep 2024 10:35:02 -0700 Subject: [PATCH 264/425] [SandboxIR][Bench] Initial patch for performance tracking (#107296) This patch adds a new benchmark suite for SandboxIR. It measures the performance of some of the most commonly used API functions and compares it against LLVM IR. --- llvm/benchmarks/CMakeLists.txt | 4 + llvm/benchmarks/SandboxIRBench.cpp | 115 +++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 llvm/benchmarks/SandboxIRBench.cpp diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index aa0cb77773344..1078efa55f497 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -1,5 +1,7 @@ set(LLVM_LINK_COMPONENTS + AsmParser Core + SandboxIR Support) add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED) @@ -7,3 +9,5 @@ add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicInfoTableEntriesBM GetIntrinsicInfoTableEntriesBM.cpp PARTIAL_SOURCES_INTENDED) +add_benchmark(SandboxIRBench SandboxIRBench.cpp PARTIAL_SOURCES_INTENDED) + diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp new file mode 100644 index 0000000000000..633de6db1f5e2 --- /dev/null +++ b/llvm/benchmarks/SandboxIRBench.cpp @@ -0,0 +1,115 @@ +//===- SandboxIRBench.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// These tests measure the performance of some core SandboxIR functions and +// compare them against LLVM IR. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "benchmark/benchmark.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/SandboxIR/SandboxIR.h"
+#include "llvm/Support/SourceMgr.h"
+#include <sstream>
+
+using namespace llvm;
+
+static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
+  SMDiagnostic Err;
+  std::unique_ptr<Module> M = parseAssemblyString(IR, Err, C);
+  if (!M)
+    Err.print("SandboxIRBench", errs());
+  return M;
+}
+
+enum class IR {
+  LLVM,
+  SBox,
+};
+// Traits to get llvm::BasicBlock/sandboxir::BasicBlock from IR::LLVM/IR::SBox.
+template <IR IRTy> struct TypeSelect {};
+template <> struct TypeSelect<IR::LLVM> {
+  using BasicBlock = llvm::BasicBlock;
+};
+template <> struct TypeSelect<IR::SBox> {
+  using BasicBlock = sandboxir::BasicBlock;
+};
+
+template <IR IRTy>
+static typename TypeSelect<IRTy>::BasicBlock *
+genIR(std::unique_ptr<llvm::Module> &LLVMM, LLVMContext &LLVMCtx,
+      sandboxir::Context &Ctx,
+      std::function<std::string(unsigned)> GenerateIRStr,
+      unsigned NumInstrs = 0u) {
+  std::string IRStr = GenerateIRStr(NumInstrs);
+  LLVMM = parseIR(LLVMCtx, IRStr.c_str());
+  llvm::Function *LLVMF = &*LLVMM->getFunction("foo");
+  llvm::BasicBlock *LLVMBB = &*LLVMF->begin();
+
+  sandboxir::Function *F = Ctx.createFunction(LLVMF);
+  sandboxir::BasicBlock *BB = &*F->begin();
+  if constexpr (IRTy == IR::LLVM)
+    return LLVMBB;
+  else
+    return BB;
+}
+
+static std::string generateBBWalkIR(unsigned Size) {
+  std::stringstream SS;
+  SS << "define void @foo(i32 %v1, i32 %v2) {\n";
+  for (auto Cnt : seq<unsigned>(0, Size))
+    SS << " %add" << Cnt << " = add i32 %v1, %v2\n";
+  SS << "ret void";
+  SS << "}";
+  return SS.str();
+}
+
+template <IR IRTy> static void BBWalk(benchmark::State &State) {
+  LLVMContext LLVMCtx;
+  sandboxir::Context Ctx(LLVMCtx);
+  unsigned NumInstrs = State.range(0);
+  std::unique_ptr<llvm::Module> LLVMM;
+  auto *BB = genIR<IRTy>(LLVMM, LLVMCtx, Ctx, generateBBWalkIR, NumInstrs);
+  for (auto _ : State) {
+    // Walk LLVM Instructions.
+    for (auto &I : *BB)
+      benchmark::DoNotOptimize(I);
+  }
+}
+
+static std::string generateGetTypeIR(unsigned Size) {
+  return R"IR(
+define void @foo(i32 %v1, i32 %v2) {
+  %add = add i32 %v1, %v2
+  ret void
+}
+)IR";
+}
+
+template <IR IRTy> static void GetType(benchmark::State &State) {
+  LLVMContext LLVMCtx;
+  sandboxir::Context Ctx(LLVMCtx);
+  std::unique_ptr<llvm::Module> LLVMM;
+  auto *BB = genIR<IRTy>(LLVMM, LLVMCtx, Ctx, generateGetTypeIR);
+  auto *I = &*BB->begin();
+  for (auto _ : State)
+    benchmark::DoNotOptimize(I->getType());
+}
+
+BENCHMARK(GetType<IR::LLVM>);
+BENCHMARK(GetType<IR::SBox>);
+
+BENCHMARK(BBWalk<IR::LLVM>)->Args({1024});
+BENCHMARK(BBWalk<IR::SBox>)->Args({1024});
+
+BENCHMARK_MAIN();
From 3815f478bb4f1c724d36044a4e0bbd3352313322 Mon Sep 17 00:00:00 2001
From: Matthias Springer
Date: Thu, 5 Sep 2024 19:40:58 +0200
Subject: [PATCH 265/425] [mlir][Transforms] Dialect conversion: Make materializations optional (#107109)

This commit makes source/target/argument materializations (via the
`TypeConverter` API) optional.

By default (`ConversionConfig::buildMaterializations = true`), the dialect
conversion infrastructure tries to legalize all unresolved materializations
right after the main transformation process has succeeded. If at least one
unresolved materialization fails to resolve, the dialect conversion fails.
(With an error message such as `failed to legalize unresolved materialization
...`.) Automatic materializations through the `TypeConverter` API can now be
deactivated.
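For illustration, a conversion driver that wants to opt out might look roughly
like the following sketch (not part of this patch; it assumes the existing
`applyPartialConversion` overload that accepts a `ConversionConfig`):

  #include "mlir/Transforms/DialectConversion.h"

  using namespace mlir;

  // Run a partial conversion but keep unresolved materializations as
  // builtin.unrealized_conversion_cast ops instead of failing the conversion.
  static LogicalResult
  convertWithoutMaterializations(Operation *op, const ConversionTarget &target,
                                 const FrozenRewritePatternSet &patterns) {
    ConversionConfig config;
    config.buildMaterializations = false;
    return applyPartialConversion(op, target, patterns, config);
  }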
In that case, every unresolved materialization will show up as a
`builtin.unrealized_conversion_cast` op in the output IR.

There used to be a complex and error-prone analysis in the dialect conversion
that predicted the future uses of unresolved materializations. Based on that
logic, some casts (that were deemed unnecessary) were folded. This analysis
was needed because folding happened at a point in time when some IR changes
(e.g., op replacements) had not materialized yet.

This commit removes that analysis. Any folding of cast ops now happens after
all other IR changes have been materialized and the uses can directly be
queried from the IR. This simplifies the analysis significantly. Certain
helper data structures such as `inverseMapping` are also no longer needed for
the analysis.

The folding itself is done by `reconcileUnrealizedCasts` (which also exists as
a standalone pass). After casts have been folded, the remaining casts are
materialized through the `TypeConverter`, as usual. This last step can be
deactivated in the `ConversionConfig`.

`ConversionConfig::buildMaterializations = false` can be used to debug error
messages such as `failed to legalize unresolved materialization ...`. (It is
also useful in case automatic materializations are not needed.) The
materializations that failed to resolve can then be seen as
`builtin.unrealized_conversion_cast` ops in the resulting IR. (This is better
than running with `-debug`, because `-debug` shows IR where some IR changes
have not been materialized yet.)

Note: This is a reupload of #104668, but with correct handling of cyclic
unrealized_conversion_casts that may be generated by the dialect conversion.
---
 .../mlir/Transforms/DialectConversion.h       |  11 +
 .../Transforms/Utils/DialectConversion.cpp    | 466 +++++++-----------
 .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir |   5 +-
 .../Transforms/finalizing-bufferize.mlir      |   1 +
 .../test-legalize-type-conversion.mlir        |   6 +-
 mlir/test/Transforms/test-legalizer.mlir      |  11 +
 mlir/test/lib/Dialect/Test/TestPatterns.cpp   |  34 +-
 7 files changed, 234 insertions(+), 300 deletions(-)

diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index 60113bdef16a2..5f680e8eca755 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -1124,6 +1124,17 @@ struct ConversionConfig {
   // already been modified) and iterators into past IR state cannot be
   // represented at the moment.
   RewriterBase::Listener *listener = nullptr;
+
+  /// If set to "true", the dialect conversion attempts to build source/target/
+  /// argument materializations through the type converter API in lieu of
+  /// builtin.unrealized_conversion_cast ops. The conversion process fails if
+  /// at least one materialization could not be built.
+  ///
+  /// If set to "false", the dialect conversion does not build any
+  /// custom materializations and instead inserts
+  /// builtin.unrealized_conversion_cast ops to ensure that the resulting IR
+  /// is valid.
+ bool buildMaterializations = true; }; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index b23fb97959ed6..450e66f0db4e7 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -702,14 +702,12 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { return rewrite->getKind() == Kind::UnresolvedMaterialization; } + void rollback() override; + UnrealizedConversionCastOp getOperation() const { return cast(op); } - void rollback() override; - - void cleanup(RewriterBase &rewriter) override; - /// Return the type converter of this materialization (which may be null). const TypeConverter *getConverter() const { return converterAndKind.getPointer(); @@ -766,7 +764,7 @@ namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(MLIRContext *ctx, const ConversionConfig &config) - : context(ctx), config(config) {} + : context(ctx), eraseRewriter(ctx), config(config) {} //===--------------------------------------------------------------------===// // State Management @@ -834,6 +832,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { //===--------------------------------------------------------------------===// // Materializations //===--------------------------------------------------------------------===// + /// Build an unresolved materialization operation given an output type and set /// of input operands. Value buildUnresolvedMaterialization(MaterializationKind kind, @@ -882,7 +881,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given op (unless it was already erased). void eraseOp(Operation *op) override { - if (erased.contains(op)) + if (wasErased(op)) return; op->dropAllUses(); RewriterBase::eraseOp(op); @@ -890,17 +889,24 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given block (unless it was already erased). void eraseBlock(Block *block) override { - if (erased.contains(block)) + if (wasErased(block)) return; assert(block->empty() && "expected empty block"); block->dropAllDefinedValueUses(); RewriterBase::eraseBlock(block); } + bool wasErased(void *ptr) const { return erased.contains(ptr); } + + bool wasErased(OperationRewrite *rewrite) const { + return wasErased(rewrite->getOperation()); + } + void notifyOperationErased(Operation *op) override { erased.insert(op); } void notifyBlockErased(Block *block) override { erased.insert(block); } + private: /// Pointers to all erased operations and blocks. DenseSet erased; }; @@ -912,6 +918,11 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// MLIR context. MLIRContext *context; + /// A rewriter that keeps track of ops/block that were already erased and + /// skips duplicate op/block erasures. This rewriter is used during the + /// "cleanup" phase. + SingleEraseRewriter eraseRewriter; + // Mapping between replaced values that differ in type. This happens when // replacing a value with one of a different type. ConversionValueMapping mapping; @@ -1058,10 +1069,6 @@ void UnresolvedMaterializationRewrite::rollback() { op->erase(); } -void UnresolvedMaterializationRewrite::cleanup(RewriterBase &rewriter) { - rewriter.eraseOp(op); -} - void ConversionPatternRewriterImpl::applyRewrites() { // Commit all rewrites. 
IRRewriter rewriter(context, config.listener); @@ -1069,7 +1076,6 @@ void ConversionPatternRewriterImpl::applyRewrites() { rewrite->commit(rewriter); // Clean up all rewrites. - SingleEraseRewriter eraseRewriter(context); for (auto &rewrite : rewrites) rewrite->cleanup(eraseRewriter); } @@ -2353,12 +2359,6 @@ struct OperationConverter { ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping); - /// Legalize any unresolved type materializations. - LogicalResult legalizeUnresolvedMaterializations( - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping); - /// Legalize an operation result that was marked as "erased". LogicalResult legalizeErasedResult(Operation *op, OpResult result, @@ -2405,6 +2405,128 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter, return success(); } +static LogicalResult +legalizeUnresolvedMaterialization(RewriterBase &rewriter, + UnresolvedMaterializationRewrite *rewrite) { + UnrealizedConversionCastOp op = rewrite->getOperation(); + assert(!op.use_empty() && + "expected that dead materializations have already been DCE'd"); + Operation::operand_range inputOperands = op.getOperands(); + Type outputType = op.getResultTypes()[0]; + + // Try to materialize the conversion. + if (const TypeConverter *converter = rewrite->getConverter()) { + rewriter.setInsertionPoint(op); + Value newMaterialization; + switch (rewrite->getMaterializationKind()) { + case MaterializationKind::Argument: + // Try to materialize an argument conversion. + newMaterialization = converter->materializeArgumentConversion( + rewriter, op->getLoc(), outputType, inputOperands); + if (newMaterialization) + break; + // If an argument materialization failed, fallback to trying a target + // materialization. + [[fallthrough]]; + case MaterializationKind::Target: + newMaterialization = converter->materializeTargetConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + case MaterializationKind::Source: + newMaterialization = converter->materializeSourceConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + } + if (newMaterialization) { + assert(newMaterialization.getType() == outputType && + "materialization callback produced value of incorrect type"); + rewriter.replaceOp(op, newMaterialization); + return success(); + } + } + + InFlightDiagnostic diag = op->emitError() + << "failed to legalize unresolved materialization " + "from (" + << inputOperands.getTypes() << ") to " << outputType + << " that remained live after conversion"; + diag.attachNote(op->getUsers().begin()->getLoc()) + << "see existing live user here: " << *op->getUsers().begin(); + return failure(); +} + +/// Erase all dead unrealized_conversion_cast ops. An op is dead if its results +/// are not used (transitively) by any op that is not in the given list of +/// cast ops. +/// +/// In particular, this function erases cyclic casts that may be inserted +/// during the dialect conversion process. E.g.: +/// %0 = unrealized_conversion_cast(%1) +/// %1 = unrealized_conversion_cast(%0) +// Note: This step will become unnecessary when +// https://github.com/llvm/llvm-project/pull/106760 has been merged. +static void eraseDeadUnrealizedCasts( + ArrayRef castOps, + SmallVectorImpl *remainingCastOps) { + // Ops that have already been visited or are currently being visited. + DenseSet visited; + // Set of all cast ops for faster lookups. 
+ DenseSet castOpSet; + // Set of all cast ops that have been determined to be alive. + DenseSet live; + + for (UnrealizedConversionCastOp op : castOps) + castOpSet.insert(op); + + // Visit a cast operation. Return "true" if the operation is live. + std::function visit = [&](Operation *op) -> bool { + // No need to traverse any IR if the op was already marked as live. + if (live.contains(op)) + return true; + + // Do not visit ops multiple times. If we find a circle, no live user was + // found on the current path. + if (visited.contains(op)) + return false; + visited.insert(op); + + // Visit all users. + for (Operation *user : op->getUsers()) { + // If the user is not an unrealized_conversion_cast op, then the given op + // is live. + if (!castOpSet.contains(user)) { + live.insert(op); + return true; + } + // Otherwise, it is live if a live op can be reached from one of its + // users (which must all be unrealized_conversion_cast ops). + if (visit(user)) { + live.insert(op); + return true; + } + } + + return false; + }; + + // Visit all cast ops. + for (UnrealizedConversionCastOp op : castOps) { + visit(op); + visited.clear(); + } + + // Erase all cast ops that are dead. + for (UnrealizedConversionCastOp op : castOps) { + if (live.contains(op)) { + if (remainingCastOps) + remainingCastOps->push_back(op); + continue; + } + op->dropAllUses(); + op->erase(); + } +} + LogicalResult OperationConverter::convertOperations(ArrayRef ops) { if (ops.empty()) return success(); @@ -2446,6 +2568,38 @@ LogicalResult OperationConverter::convertOperations(ArrayRef ops) { } else { rewriterImpl.applyRewrites(); } + + // Gather all unresolved materializations. + SmallVector allCastOps; + DenseMap rewriteMap; + for (std::unique_ptr &rewrite : rewriterImpl.rewrites) { + auto *mat = dyn_cast(rewrite.get()); + if (!mat) + continue; + if (rewriterImpl.eraseRewriter.wasErased(mat)) + continue; + allCastOps.push_back(mat->getOperation()); + rewriteMap[mat->getOperation()] = mat; + } + + // Reconcile all UnrealizedConversionCastOps that were inserted by the + // dialect conversion frameworks. (Not the one that were inserted by + // patterns.) + SmallVector remainingCastOps1, remainingCastOps2; + eraseDeadUnrealizedCasts(allCastOps, &remainingCastOps1); + reconcileUnrealizedCasts(remainingCastOps1, &remainingCastOps2); + + // Try to legalize all unresolved materializations. + if (config.buildMaterializations) { + IRRewriter rewriter(rewriterImpl.context, config.listener); + for (UnrealizedConversionCastOp castOp : remainingCastOps2) { + auto it = rewriteMap.find(castOp.getOperation()); + assert(it != rewriteMap.end() && "inconsistent state"); + if (failed(legalizeUnresolvedMaterialization(rewriter, it->second))) + return failure(); + } + } + return success(); } @@ -2459,9 +2613,6 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { if (failed(legalizeConvertedOpResultTypes(rewriter, rewriterImpl, inverseMapping))) return failure(); - if (failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl, - inverseMapping))) - return failure(); return success(); } @@ -2577,279 +2728,6 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( return success(); } -/// Replace the results of a materialization operation with the given values. 
-static void -replaceMaterialization(ConversionPatternRewriterImpl &rewriterImpl, - ResultRange matResults, ValueRange values, - DenseMap> &inverseMapping) { - matResults.replaceAllUsesWith(values); - - // For each of the materialization results, update the inverse mappings to - // point to the replacement values. - for (auto [matResult, newValue] : llvm::zip(matResults, values)) { - auto inverseMapIt = inverseMapping.find(matResult); - if (inverseMapIt == inverseMapping.end()) - continue; - - // Update the reverse mapping, or remove the mapping if we couldn't update - // it. Not being able to update signals that the mapping would have become - // circular (i.e. %foo -> newValue -> %foo), which may occur as values are - // propagated through temporary materializations. We simply drop the - // mapping, and let the post-conversion replacement logic handle updating - // uses. - for (Value inverseMapVal : inverseMapIt->second) - if (!rewriterImpl.mapping.tryMap(inverseMapVal, newValue)) - rewriterImpl.mapping.erase(inverseMapVal); - } -} - -/// Compute all of the unresolved materializations that will persist beyond the -/// conversion process, and require inserting a proper user materialization for. -static void computeNecessaryMaterializations( - DenseMap - &materializationOps, - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping, - SetVector &necessaryMaterializations) { - // Helper function to check if the given value or a not yet materialized - // replacement of the given value is live. - // Note: `inverseMapping` maps from replaced values to original values. - auto isLive = [&](Value value) { - auto findFn = [&](Operation *user) { - auto matIt = materializationOps.find(user); - if (matIt != materializationOps.end()) - return !necessaryMaterializations.count(matIt->second); - return rewriterImpl.isOpIgnored(user); - }; - // A worklist is needed because a value may have gone through a chain of - // replacements and each of the replaced values may have live users. - SmallVector worklist; - worklist.push_back(value); - while (!worklist.empty()) { - Value next = worklist.pop_back_val(); - if (llvm::find_if_not(next.getUsers(), findFn) != next.user_end()) - return true; - // This value may be replacing another value that has a live user. - llvm::append_range(worklist, inverseMapping.lookup(next)); - } - return false; - }; - - llvm::unique_function lookupRemappedValue = - [&](Value invalidRoot, Value value, Type type) { - // Check to see if the input operation was remapped to a variant of the - // output. - Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); - if (remappedValue.getType() == type && remappedValue != invalidRoot) - return remappedValue; - - // Check to see if the input is a materialization operation that - // provides an inverse conversion. We just check blindly for - // UnrealizedConversionCastOp here, but it has no effect on correctness. 
- auto inputCastOp = value.getDefiningOp(); - if (inputCastOp && inputCastOp->getNumOperands() == 1) - return lookupRemappedValue(invalidRoot, inputCastOp->getOperand(0), - type); - - return Value(); - }; - - SetVector worklist; - for (auto &rewrite : rewriterImpl.rewrites) { - auto *mat = dyn_cast(rewrite.get()); - if (!mat) - continue; - materializationOps.try_emplace(mat->getOperation(), mat); - worklist.insert(mat); - } - while (!worklist.empty()) { - UnresolvedMaterializationRewrite *mat = worklist.pop_back_val(); - UnrealizedConversionCastOp op = mat->getOperation(); - - // We currently only handle target materializations here. - assert(op->getNumResults() == 1 && "unexpected materialization type"); - OpResult opResult = op->getOpResult(0); - Type outputType = opResult.getType(); - Operation::operand_range inputOperands = op.getOperands(); - - // Try to forward propagate operands for user conversion casts that result - // in the input types of the current cast. - for (Operation *user : llvm::make_early_inc_range(opResult.getUsers())) { - auto castOp = dyn_cast(user); - if (!castOp) - continue; - if (castOp->getResultTypes() == inputOperands.getTypes()) { - replaceMaterialization(rewriterImpl, user->getResults(), inputOperands, - inverseMapping); - necessaryMaterializations.remove(materializationOps.lookup(user)); - } - } - - // Try to avoid materializing a resolved materialization if possible. - // Handle the case of a 1-1 materialization. - if (inputOperands.size() == 1) { - // Check to see if the input operation was remapped to a variant of the - // output. - Value remappedValue = - lookupRemappedValue(opResult, inputOperands[0], outputType); - if (remappedValue && remappedValue != opResult) { - replaceMaterialization(rewriterImpl, opResult, remappedValue, - inverseMapping); - necessaryMaterializations.remove(mat); - continue; - } - } else { - // TODO: Avoid materializing other types of conversions here. - } - - // If the materialization does not have any live users, we don't need to - // generate a user materialization for it. - bool isMaterializationLive = isLive(opResult); - if (!isMaterializationLive) - continue; - if (!necessaryMaterializations.insert(mat)) - continue; - - // Reprocess input materializations to see if they have an updated status. - for (Value input : inputOperands) { - if (auto parentOp = input.getDefiningOp()) { - if (auto *mat = materializationOps.lookup(parentOp)) - worklist.insert(mat); - } - } - } -} - -/// Legalize the given unresolved materialization. Returns success if the -/// materialization was legalized, failure otherise. -static LogicalResult legalizeUnresolvedMaterialization( - UnresolvedMaterializationRewrite &mat, - DenseMap - &materializationOps, - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping) { - auto findLiveUser = [&](auto &&users) { - auto liveUserIt = llvm::find_if_not( - users, [&](Operation *user) { return rewriterImpl.isOpIgnored(user); }); - return liveUserIt == users.end() ? nullptr : *liveUserIt; - }; - - llvm::unique_function lookupRemappedValue = - [&](Value value, Type type) { - // Check to see if the input operation was remapped to a variant of the - // output. 
- Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); - if (remappedValue.getType() == type) - return remappedValue; - return Value(); - }; - - UnrealizedConversionCastOp op = mat.getOperation(); - if (!rewriterImpl.ignoredOps.insert(op)) - return success(); - - // We currently only handle target materializations here. - OpResult opResult = op->getOpResult(0); - Operation::operand_range inputOperands = op.getOperands(); - Type outputType = opResult.getType(); - - // If any input to this materialization is another materialization, resolve - // the input first. - for (Value value : op->getOperands()) { - auto valueCast = value.getDefiningOp(); - if (!valueCast) - continue; - - auto matIt = materializationOps.find(valueCast); - if (matIt != materializationOps.end()) - if (failed(legalizeUnresolvedMaterialization( - *matIt->second, materializationOps, rewriter, rewriterImpl, - inverseMapping))) - return failure(); - } - - // Perform a last ditch attempt to avoid materializing a resolved - // materialization if possible. - // Handle the case of a 1-1 materialization. - if (inputOperands.size() == 1) { - // Check to see if the input operation was remapped to a variant of the - // output. - Value remappedValue = lookupRemappedValue(inputOperands[0], outputType); - if (remappedValue && remappedValue != opResult) { - replaceMaterialization(rewriterImpl, opResult, remappedValue, - inverseMapping); - return success(); - } - } else { - // TODO: Avoid materializing other types of conversions here. - } - - // Try to materialize the conversion. - if (const TypeConverter *converter = mat.getConverter()) { - rewriter.setInsertionPoint(op); - Value newMaterialization; - switch (mat.getMaterializationKind()) { - case MaterializationKind::Argument: - // Try to materialize an argument conversion. - newMaterialization = converter->materializeArgumentConversion( - rewriter, op->getLoc(), outputType, inputOperands); - if (newMaterialization) - break; - // If an argument materialization failed, fallback to trying a target - // materialization. - [[fallthrough]]; - case MaterializationKind::Target: - newMaterialization = converter->materializeTargetConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - case MaterializationKind::Source: - newMaterialization = converter->materializeSourceConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - } - if (newMaterialization) { - assert(newMaterialization.getType() == outputType && - "materialization callback produced value of incorrect type"); - replaceMaterialization(rewriterImpl, opResult, newMaterialization, - inverseMapping); - return success(); - } - } - - InFlightDiagnostic diag = op->emitError() - << "failed to legalize unresolved materialization " - "from (" - << inputOperands.getTypes() << ") to " << outputType - << " that remained live after conversion"; - if (Operation *liveUser = findLiveUser(op->getUsers())) { - diag.attachNote(liveUser->getLoc()) - << "see existing live user here: " << *liveUser; - } - return failure(); -} - -LogicalResult OperationConverter::legalizeUnresolvedMaterializations( - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping) { - // As an initial step, compute all of the inserted materializations that we - // expect to persist beyond the conversion process. 
- DenseMap materializationOps; - SetVector necessaryMaterializations; - computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl, - inverseMapping, necessaryMaterializations); - - // Once computed, legalize any necessary materializations. - for (auto *mat : necessaryMaterializations) { - if (failed(legalizeUnresolvedMaterialization( - *mat, materializationOps, rewriter, rewriterImpl, inverseMapping))) - return failure(); - } - return success(); -} - LogicalResult OperationConverter::legalizeErasedResult( Operation *op, OpResult result, ConversionPatternRewriterImpl &rewriterImpl) { diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 156a8a468d5b4..75362378daaaa 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -1286,7 +1286,6 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK-DAG: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.warpgroup.descriptor> to i64 // CHECK-DAG: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[arg1]] : !nvgpu.warpgroup.descriptor> to i64 -// CHECK-DAG: %[[S2:.+]] = builtin.unrealized_conversion_cast %[[arg2]] : memref<128x128xf32, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[S3:.+]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 // CHECK: %[[S4:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S5:.+]] = llvm.extractvalue %[[S4]][0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> @@ -1299,8 +1298,8 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK: %[[S136:.+]] = llvm.insertvalue %[[S134]], %[[S135]][1] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: nvvm.wgmma.fence.aligned // CHECK: %[[S137:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S138:.+]] = llvm.extractvalue %136[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %1, %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> +// CHECK: %[[S138:.+]] = llvm.extractvalue %{{.*}}[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> +// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %[[S1]], %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> 
!llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvvm.wgmma.mma_async // CHECK: nvvm.wgmma.mma_async // CHECK: %[[S154:.+]] = nvvm.wgmma.mma_async diff --git a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir index a192434c5accf..ab18ce05e355d 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir @@ -80,6 +80,7 @@ func.func @no_layout_to_dyn_layout_cast(%m: memref) -> memref // expected-error @+1 {{failed to legalize unresolved materialization from ('memref') to 'memref>' that remained live after conversion}} %1 = bufferization.to_memref %0 : memref> + // expected-note @below{{see existing live user here}} return %1 : memref> } diff --git a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir index cf2c9f6a8ec44..f130adff42f8c 100644 --- a/mlir/test/Transforms/test-legalize-type-conversion.mlir +++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir @@ -4,6 +4,7 @@ func.func @test_invalid_arg_materialization( // expected-error@below {{failed to legalize unresolved materialization from () to 'i16' that remained live after conversion}} %arg0: i16) { + // expected-note@below{{see existing live user here}} "foo.return"(%arg0) : (i16) -> () } @@ -22,6 +23,7 @@ func.func @test_valid_arg_materialization(%arg0: i64) { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 + // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -30,6 +32,7 @@ func.func @test_invalid_result_materialization() { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 + // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -49,6 +52,7 @@ func.func @test_transitive_use_materialization() { func.func @test_transitive_use_invalid_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.another_type_producer"() : () -> f16 + // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -99,9 +103,9 @@ func.func @test_block_argument_not_converted() { func.func @test_signature_conversion_no_converter() { "test.signature_conversion_no_converter"() ({ // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f32' that remained live after conversion}} - // expected-note@below {{see existing live user here}} ^bb0(%arg0: f32): "test.type_consumer"(%arg0) : (f32) -> () + // expected-note@below{{see existing live user here}} "test.return"(%arg0) : (f32) -> () }) : () -> () return diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 
a789ab9a82e19..e5503ee892042 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -452,3 +452,14 @@ func.func @convert_detached_signature() { }) : () -> () "test.return"() : () -> () } + +// ----- + +// CHECK-LABEL: func @circular_mapping() +// CHECK-NEXT: "test.valid"() : () -> () +func.func @circular_mapping() { + // Regression test that used to crash due to circular + // unrealized_conversion_cast ops. + %0 = "test.erase_op"() : () -> (i64) + "test.drop_operands_and_replace_with_valid"(%0) : (i64) -> () +} diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 91dfb2faa80a1..3cbc307835afd 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -907,6 +907,22 @@ struct TestPassthroughInvalidOp : public ConversionPattern { return success(); } }; +/// Replace with valid op, but simply drop the operands. This is used in a +/// regression where we used to generate circular unrealized_conversion_cast +/// ops. +struct TestDropAndReplaceInvalidOp : public ConversionPattern { + TestDropAndReplaceInvalidOp(MLIRContext *ctx, const TypeConverter &converter) + : ConversionPattern(converter, + "test.drop_operands_and_replace_with_valid", 1, ctx) { + } + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + rewriter.replaceOpWithNewOp(op, std::nullopt, ValueRange(), + std::nullopt); + return success(); + } +}; /// This pattern handles the case of a split return value. struct TestSplitReturnType : public ConversionPattern { TestSplitReturnType(MLIRContext *ctx) @@ -1070,6 +1086,19 @@ struct TestCreateUnregisteredOp : public OpRewritePattern { return success(); }; }; + +class TestEraseOp : public ConversionPattern { +public: + TestEraseOp(MLIRContext *ctx) : ConversionPattern("test.erase_op", 1, ctx) {} + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + // Erase op without replacements. + rewriter.eraseOp(op); + return success(); + } +}; + } // namespace namespace { @@ -1148,8 +1177,9 @@ struct TestLegalizePatternDriver TestUpdateConsumerType, TestNonRootReplacement, TestBoundedRecursiveRewrite, TestNestedOpCreationUndoRewrite, TestReplaceEraseOp, TestCreateUnregisteredOp, TestUndoMoveOpBefore, - TestUndoPropertiesModification>(&getContext()); - patterns.add(&getContext(), converter); + TestUndoPropertiesModification, TestEraseOp>(&getContext()); + patterns.add( + &getContext(), converter); mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns, converter); mlir::populateCallOpTypeConversionPattern(patterns, converter); From ebc7f5578033248ce7de52a7f374332a2fc201aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Thu, 5 Sep 2024 19:45:19 +0200 Subject: [PATCH 266/425] [llvm][SystemZ] Fix parsing of `.cfi_undefined` with percent-less registers. (#107032) This is just e3d658b applied to SystemZ. 
An example of this being used in the wild: https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/s390/s390-64/start.S;h=59eeb7e998227bdf32029cd074f0876c450404ea;hb=HEAD#l63 --- .../SystemZ/AsmParser/SystemZAsmParser.cpp | 56 ++++---- llvm/test/MC/SystemZ/regs-bad.s | 3 - llvm/test/MC/SystemZ/regs-good.s | 128 ++++++++++++++++++ 3 files changed, 161 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index f2c04215d12dd..7c3898ac67312 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -416,7 +416,8 @@ class SystemZAsmParser : public MCTargetAsmParser { return static_cast(TS); } - bool parseRegister(Register &Reg, bool RestoreOnFailure = false); + bool parseRegister(Register &Reg, bool RequirePercent, + bool RestoreOnFailure = false); bool parseIntegerRegister(Register &Reg, RegisterGroup Group); @@ -495,7 +496,7 @@ class SystemZAsmParser : public MCTargetAsmParser { ParseStatus parseDirective(AsmToken DirectiveID) override; bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, SMLoc &EndLoc, - bool RestoreOnFailure); + bool RequirePercent, bool RestoreOnFailure); ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -756,26 +757,32 @@ void SystemZOperand::print(raw_ostream &OS) const { } // Parse one register of the form %. -bool SystemZAsmParser::parseRegister(Register &Reg, bool RestoreOnFailure) { - Reg.StartLoc = Parser.getTok().getLoc(); - - // Eat the % prefix. - if (Parser.getTok().isNot(AsmToken::Percent)) - return Error(Parser.getTok().getLoc(), "register expected"); +bool SystemZAsmParser::parseRegister(Register &Reg, bool RequirePercent, + bool RestoreOnFailure) { const AsmToken &PercentTok = Parser.getTok(); - Parser.Lex(); + bool HasPercent = PercentTok.is(AsmToken::Percent); + + Reg.StartLoc = PercentTok.getLoc(); + + if (RequirePercent && PercentTok.isNot(AsmToken::Percent)) + return Error(PercentTok.getLoc(), "register expected"); + + if (HasPercent) { + Parser.Lex(); // Eat percent token. + } // Expect a register name. if (Parser.getTok().isNot(AsmToken::Identifier)) { - if (RestoreOnFailure) + if (RestoreOnFailure && HasPercent) getLexer().UnLex(PercentTok); - return Error(Reg.StartLoc, "invalid register"); + return Error(Reg.StartLoc, + HasPercent ? "invalid register" : "register expected"); } // Check that there's a prefix. StringRef Name = Parser.getTok().getString(); if (Name.size() < 2) { - if (RestoreOnFailure) + if (RestoreOnFailure && HasPercent) getLexer().UnLex(PercentTok); return Error(Reg.StartLoc, "invalid register"); } @@ -783,7 +790,7 @@ bool SystemZAsmParser::parseRegister(Register &Reg, bool RestoreOnFailure) { // Treat the rest of the register name as a register number. 
if (Name.substr(1).getAsInteger(10, Reg.Num)) { - if (RestoreOnFailure) + if (RestoreOnFailure && HasPercent) getLexer().UnLex(PercentTok); return Error(Reg.StartLoc, "invalid register"); } @@ -800,7 +807,7 @@ bool SystemZAsmParser::parseRegister(Register &Reg, bool RestoreOnFailure) { else if (Prefix == 'c' && Reg.Num < 16) Reg.Group = RegCR; else { - if (RestoreOnFailure) + if (RestoreOnFailure && HasPercent) getLexer().UnLex(PercentTok); return Error(Reg.StartLoc, "invalid register"); } @@ -842,7 +849,7 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands, // Handle register names of the form % if (isParsingATT() && Parser.getTok().is(AsmToken::Percent)) { - if (parseRegister(Reg)) + if (parseRegister(Reg, /*RequirePercent=*/true)) return ParseStatus::Failure; // Check the parsed register group "Reg.Group" with the expected "Group" @@ -918,7 +925,7 @@ ParseStatus SystemZAsmParser::parseAnyRegister(OperandVector &Operands) { return ParseStatus::NoMatch; Register Reg; - if (parseRegister(Reg)) + if (parseRegister(Reg, /*RequirePercent=*/true)) return ParseStatus::Failure; if (Reg.Num > 15) @@ -1025,7 +1032,7 @@ bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1, if (isParsingATT() && getLexer().is(AsmToken::Percent)) { // Parse the first register. HaveReg1 = true; - if (parseRegister(Reg1)) + if (parseRegister(Reg1, /*RequirePercent=*/true)) return true; } // So if we have an integer as the first token in ([tok1], ..), it could: @@ -1065,7 +1072,7 @@ bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1, if (parseIntegerRegister(Reg2, RegGR)) return true; } else { - if (isParsingATT() && parseRegister(Reg2)) + if (isParsingATT() && parseRegister(Reg2, /*RequirePercent=*/true)) return true; } } @@ -1355,9 +1362,10 @@ bool SystemZAsmParser::ParseGNUAttribute(SMLoc L) { } bool SystemZAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, - SMLoc &EndLoc, bool RestoreOnFailure) { + SMLoc &EndLoc, bool RequirePercent, + bool RestoreOnFailure) { Register Reg; - if (parseRegister(Reg, RestoreOnFailure)) + if (parseRegister(Reg, RequirePercent, RestoreOnFailure)) return true; if (Reg.Group == RegGR) RegNo = SystemZMC::GR64Regs[Reg.Num]; @@ -1376,12 +1384,14 @@ bool SystemZAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, bool SystemZAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) { - return ParseRegister(Reg, StartLoc, EndLoc, /*RestoreOnFailure=*/false); + return ParseRegister(Reg, StartLoc, EndLoc, /*RequirePercent=*/false, + /*RestoreOnFailure=*/false); } ParseStatus SystemZAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) { - bool Result = ParseRegister(Reg, StartLoc, EndLoc, /*RestoreOnFailure=*/true); + bool Result = ParseRegister(Reg, StartLoc, EndLoc, /*RequirePercent=*/false, + /*RestoreOnFailure=*/true); bool PendingErrors = getParser().hasPendingError(); getParser().clearPendingErrors(); if (PendingErrors) @@ -1482,7 +1492,7 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands, // the instruction isn't recognized. 
if (isParsingATT() && Parser.getTok().is(AsmToken::Percent)) { Register Reg; - if (parseRegister(Reg)) + if (parseRegister(Reg, /*RequirePercent=*/true)) return true; Operands.push_back(SystemZOperand::createInvalid(Reg.StartLoc, Reg.EndLoc)); return false; diff --git a/llvm/test/MC/SystemZ/regs-bad.s b/llvm/test/MC/SystemZ/regs-bad.s index 320cba0fc856c..6392ff2863002 100644 --- a/llvm/test/MC/SystemZ/regs-bad.s +++ b/llvm/test/MC/SystemZ/regs-bad.s @@ -262,8 +262,6 @@ # Test general register parsing, with no predetermined class in mind. # -#CHECK: error: register expected -#CHECK: .cfi_offset r0,0 #CHECK: error: invalid register #CHECK: .cfi_offset %,0 #CHECK: error: invalid register @@ -289,7 +287,6 @@ #CHECK: error: invalid register #CHECK: .cfi_offset %arid,0 - .cfi_offset r0,0 .cfi_offset %,0 .cfi_offset %r,0 .cfi_offset %f,0 diff --git a/llvm/test/MC/SystemZ/regs-good.s b/llvm/test/MC/SystemZ/regs-good.s index b4c1edd1b591e..49bf8a48ca4c2 100644 --- a/llvm/test/MC/SystemZ/regs-good.s +++ b/llvm/test/MC/SystemZ/regs-good.s @@ -176,6 +176,70 @@ st 0, 4095(1,15) st 0, 4095(15,1) +#CHECK: .cfi_offset %r0, 0 +#CHECK: .cfi_offset %r1, 8 +#CHECK: .cfi_offset %r2, 16 +#CHECK: .cfi_offset %r3, 24 +#CHECK: .cfi_offset %r4, 32 +#CHECK: .cfi_offset %r5, 40 +#CHECK: .cfi_offset %r6, 48 +#CHECK: .cfi_offset %r7, 56 +#CHECK: .cfi_offset %r8, 64 +#CHECK: .cfi_offset %r9, 72 +#CHECK: .cfi_offset %r10, 80 +#CHECK: .cfi_offset %r11, 88 +#CHECK: .cfi_offset %r12, 96 +#CHECK: .cfi_offset %r13, 104 +#CHECK: .cfi_offset %r14, 112 +#CHECK: .cfi_offset %r15, 120 +#CHECK: .cfi_offset %f0, 128 +#CHECK: .cfi_offset %f1, 136 +#CHECK: .cfi_offset %f2, 144 +#CHECK: .cfi_offset %f3, 152 +#CHECK: .cfi_offset %f4, 160 +#CHECK: .cfi_offset %f5, 168 +#CHECK: .cfi_offset %f6, 176 +#CHECK: .cfi_offset %f7, 184 +#CHECK: .cfi_offset %f8, 192 +#CHECK: .cfi_offset %f9, 200 +#CHECK: .cfi_offset %f10, 208 +#CHECK: .cfi_offset %f11, 216 +#CHECK: .cfi_offset %f12, 224 +#CHECK: .cfi_offset %f13, 232 +#CHECK: .cfi_offset %f14, 240 +#CHECK: .cfi_offset %f15, 248 +#CHECK: .cfi_offset %a0, 256 +#CHECK: .cfi_offset %a1, 260 +#CHECK: .cfi_offset %a2, 264 +#CHECK: .cfi_offset %a3, 268 +#CHECK: .cfi_offset %a4, 272 +#CHECK: .cfi_offset %a5, 276 +#CHECK: .cfi_offset %a6, 280 +#CHECK: .cfi_offset %a7, 284 +#CHECK: .cfi_offset %a8, 288 +#CHECK: .cfi_offset %r9, 292 +#CHECK: .cfi_offset %a10, 296 +#CHECK: .cfi_offset %a11, 300 +#CHECK: .cfi_offset %a12, 304 +#CHECK: .cfi_offset %a13, 308 +#CHECK: .cfi_offset %a14, 312 +#CHECK: .cfi_offset %a15, 316 +#CHECK: .cfi_offset %c0, 318 +#CHECK: .cfi_offset %c1, 326 +#CHECK: .cfi_offset %c2, 334 +#CHECK: .cfi_offset %c3, 342 +#CHECK: .cfi_offset %c4, 350 +#CHECK: .cfi_offset %c5, 358 +#CHECK: .cfi_offset %c6, 366 +#CHECK: .cfi_offset %c7, 374 +#CHECK: .cfi_offset %c8, 382 +#CHECK: .cfi_offset %c9, 390 +#CHECK: .cfi_offset %c10, 398 +#CHECK: .cfi_offset %c11, 406 +#CHECK: .cfi_offset %c12, 414 +#CHECK: .cfi_offset %c13, 422 +#CHECK: .cfi_offset %c14, 430 +#CHECK: .cfi_offset %c15, 438 #CHECK: .cfi_offset %r0, 0 #CHECK: .cfi_offset %r1, 8 #CHECK: .cfi_offset %r2, 16 @@ -306,4 +370,68 @@ .cfi_offset %c13,422 .cfi_offset %c14,430 .cfi_offset %c15,438 + .cfi_offset r0,0 + .cfi_offset r1,8 + .cfi_offset r2,16 + .cfi_offset r3,24 + .cfi_offset r4,32 + .cfi_offset r5,40 + .cfi_offset r6,48 + .cfi_offset r7,56 + .cfi_offset r8,64 + .cfi_offset r9,72 + .cfi_offset r10,80 + .cfi_offset r11,88 + .cfi_offset r12,96 + .cfi_offset r13,104 + .cfi_offset r14,112 + .cfi_offset r15,120 + .cfi_offset f0,128 + 
.cfi_offset f1,136 + .cfi_offset f2,144 + .cfi_offset f3,152 + .cfi_offset f4,160 + .cfi_offset f5,168 + .cfi_offset f6,176 + .cfi_offset f7,184 + .cfi_offset f8,192 + .cfi_offset f9,200 + .cfi_offset f10,208 + .cfi_offset f11,216 + .cfi_offset f12,224 + .cfi_offset f13,232 + .cfi_offset f14,240 + .cfi_offset f15,248 + .cfi_offset a0,256 + .cfi_offset a1,260 + .cfi_offset a2,264 + .cfi_offset a3,268 + .cfi_offset a4,272 + .cfi_offset a5,276 + .cfi_offset a6,280 + .cfi_offset a7,284 + .cfi_offset a8,288 + .cfi_offset r9,292 + .cfi_offset a10,296 + .cfi_offset a11,300 + .cfi_offset a12,304 + .cfi_offset a13,308 + .cfi_offset a14,312 + .cfi_offset a15,316 + .cfi_offset c0,318 + .cfi_offset c1,326 + .cfi_offset c2,334 + .cfi_offset c3,342 + .cfi_offset c4,350 + .cfi_offset c5,358 + .cfi_offset c6,366 + .cfi_offset c7,374 + .cfi_offset c8,382 + .cfi_offset c9,390 + .cfi_offset c10,398 + .cfi_offset c11,406 + .cfi_offset c12,414 + .cfi_offset c13,422 + .cfi_offset c14,430 + .cfi_offset c15,438 .cfi_endproc From bedac64d36dce88ea25bd444c60eaac7d420550e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Thu, 5 Sep 2024 19:45:53 +0200 Subject: [PATCH 267/425] [llvm][SystemZ] Recognize `@GOTENT` modifier in assembler. (#107038) Closes #105918. I'm unsure if there are other places that need to be updated for this. --- llvm/include/llvm/MC/MCExpr.h | 1 + llvm/lib/MC/MCExpr.cpp | 264 +++++++++--------- .../MCTargetDesc/SystemZELFObjectWriter.cpp | 1 + llvm/test/MC/SystemZ/fixups.s | 6 + 4 files changed, 141 insertions(+), 131 deletions(-) diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index 118b1dd88525a..10bc6ebd6fe50 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -192,6 +192,7 @@ class MCSymbolRefExpr : public MCExpr { VK_Invalid, VK_GOT, + VK_GOTENT, VK_GOTOFF, VK_GOTREL, VK_PCREL, diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index e4ca431c3d25f..c9d5f6580fda4 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -226,6 +226,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_DTPOFF: return "DTPOFF"; case VK_DTPREL: return "DTPREL"; case VK_GOT: return "GOT"; + case VK_GOTENT: return "GOTENT"; case VK_GOTOFF: return "GOTOFF"; case VK_GOTREL: return "GOTREL"; case VK_PCREL: return "PCREL"; @@ -404,137 +405,138 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { MCSymbolRefExpr::VariantKind MCSymbolRefExpr::getVariantKindForName(StringRef Name) { return StringSwitch(Name.lower()) - .Case("dtprel", VK_DTPREL) - .Case("dtpoff", VK_DTPOFF) - .Case("got", VK_GOT) - .Case("gotoff", VK_GOTOFF) - .Case("gotrel", VK_GOTREL) - .Case("pcrel", VK_PCREL) - .Case("gotpcrel", VK_GOTPCREL) - .Case("gotpcrel_norelax", VK_GOTPCREL_NORELAX) - .Case("gottpoff", VK_GOTTPOFF) - .Case("indntpoff", VK_INDNTPOFF) - .Case("ntpoff", VK_NTPOFF) - .Case("gotntpoff", VK_GOTNTPOFF) - .Case("plt", VK_PLT) - .Case("tlscall", VK_TLSCALL) - .Case("tlsdesc", VK_TLSDESC) - .Case("tlsgd", VK_TLSGD) - .Case("tlsld", VK_TLSLD) - .Case("tlsldm", VK_TLSLDM) - .Case("tpoff", VK_TPOFF) - .Case("tprel", VK_TPREL) - .Case("tlvp", VK_TLVP) - .Case("tlvppage", VK_TLVPPAGE) - .Case("tlvppageoff", VK_TLVPPAGEOFF) - .Case("page", VK_PAGE) - .Case("pageoff", VK_PAGEOFF) - .Case("gotpage", VK_GOTPAGE) - .Case("gotpageoff", VK_GOTPAGEOFF) - .Case("imgrel", VK_COFF_IMGREL32) - .Case("secrel32", VK_SECREL) - .Case("size", VK_SIZE) - .Case("abs8", VK_X86_ABS8) - .Case("pltoff", VK_X86_PLTOFF) - 
.Case("l", VK_PPC_LO) - .Case("h", VK_PPC_HI) - .Case("ha", VK_PPC_HA) - .Case("high", VK_PPC_HIGH) - .Case("higha", VK_PPC_HIGHA) - .Case("higher", VK_PPC_HIGHER) - .Case("highera", VK_PPC_HIGHERA) - .Case("highest", VK_PPC_HIGHEST) - .Case("highesta", VK_PPC_HIGHESTA) - .Case("got@l", VK_PPC_GOT_LO) - .Case("got@h", VK_PPC_GOT_HI) - .Case("got@ha", VK_PPC_GOT_HA) - .Case("local", VK_PPC_LOCAL) - .Case("tocbase", VK_PPC_TOCBASE) - .Case("toc", VK_PPC_TOC) - .Case("toc@l", VK_PPC_TOC_LO) - .Case("toc@h", VK_PPC_TOC_HI) - .Case("toc@ha", VK_PPC_TOC_HA) - .Case("u", VK_PPC_U) - .Case("l", VK_PPC_L) - .Case("tls", VK_PPC_TLS) - .Case("dtpmod", VK_PPC_DTPMOD) - .Case("tprel@l", VK_PPC_TPREL_LO) - .Case("tprel@h", VK_PPC_TPREL_HI) - .Case("tprel@ha", VK_PPC_TPREL_HA) - .Case("tprel@high", VK_PPC_TPREL_HIGH) - .Case("tprel@higha", VK_PPC_TPREL_HIGHA) - .Case("tprel@higher", VK_PPC_TPREL_HIGHER) - .Case("tprel@highera", VK_PPC_TPREL_HIGHERA) - .Case("tprel@highest", VK_PPC_TPREL_HIGHEST) - .Case("tprel@highesta", VK_PPC_TPREL_HIGHESTA) - .Case("dtprel@l", VK_PPC_DTPREL_LO) - .Case("dtprel@h", VK_PPC_DTPREL_HI) - .Case("dtprel@ha", VK_PPC_DTPREL_HA) - .Case("dtprel@high", VK_PPC_DTPREL_HIGH) - .Case("dtprel@higha", VK_PPC_DTPREL_HIGHA) - .Case("dtprel@higher", VK_PPC_DTPREL_HIGHER) - .Case("dtprel@highera", VK_PPC_DTPREL_HIGHERA) - .Case("dtprel@highest", VK_PPC_DTPREL_HIGHEST) - .Case("dtprel@highesta", VK_PPC_DTPREL_HIGHESTA) - .Case("got@tprel", VK_PPC_GOT_TPREL) - .Case("got@tprel@l", VK_PPC_GOT_TPREL_LO) - .Case("got@tprel@h", VK_PPC_GOT_TPREL_HI) - .Case("got@tprel@ha", VK_PPC_GOT_TPREL_HA) - .Case("got@dtprel", VK_PPC_GOT_DTPREL) - .Case("got@dtprel@l", VK_PPC_GOT_DTPREL_LO) - .Case("got@dtprel@h", VK_PPC_GOT_DTPREL_HI) - .Case("got@dtprel@ha", VK_PPC_GOT_DTPREL_HA) - .Case("got@tlsgd", VK_PPC_GOT_TLSGD) - .Case("got@tlsgd@l", VK_PPC_GOT_TLSGD_LO) - .Case("got@tlsgd@h", VK_PPC_GOT_TLSGD_HI) - .Case("got@tlsgd@ha", VK_PPC_GOT_TLSGD_HA) - .Case("got@tlsld", VK_PPC_GOT_TLSLD) - .Case("got@tlsld@l", VK_PPC_GOT_TLSLD_LO) - .Case("got@tlsld@h", VK_PPC_GOT_TLSLD_HI) - .Case("got@tlsld@ha", VK_PPC_GOT_TLSLD_HA) - .Case("got@pcrel", VK_PPC_GOT_PCREL) - .Case("got@tlsgd@pcrel", VK_PPC_GOT_TLSGD_PCREL) - .Case("got@tlsld@pcrel", VK_PPC_GOT_TLSLD_PCREL) - .Case("got@tprel@pcrel", VK_PPC_GOT_TPREL_PCREL) - .Case("tls@pcrel", VK_PPC_TLS_PCREL) - .Case("notoc", VK_PPC_NOTOC) - .Case("gdgot", VK_Hexagon_GD_GOT) - .Case("gdplt", VK_Hexagon_GD_PLT) - .Case("iegot", VK_Hexagon_IE_GOT) - .Case("ie", VK_Hexagon_IE) - .Case("ldgot", VK_Hexagon_LD_GOT) - .Case("ldplt", VK_Hexagon_LD_PLT) - .Case("lo8", VK_AVR_LO8) - .Case("hi8", VK_AVR_HI8) - .Case("hlo8", VK_AVR_HLO8) - .Case("typeindex", VK_WASM_TYPEINDEX) - .Case("tbrel", VK_WASM_TBREL) - .Case("mbrel", VK_WASM_MBREL) - .Case("tlsrel", VK_WASM_TLSREL) - .Case("got@tls", VK_WASM_GOT_TLS) - .Case("funcindex", VK_WASM_FUNCINDEX) - .Case("gotpcrel32@lo", VK_AMDGPU_GOTPCREL32_LO) - .Case("gotpcrel32@hi", VK_AMDGPU_GOTPCREL32_HI) - .Case("rel32@lo", VK_AMDGPU_REL32_LO) - .Case("rel32@hi", VK_AMDGPU_REL32_HI) - .Case("rel64", VK_AMDGPU_REL64) - .Case("abs32@lo", VK_AMDGPU_ABS32_LO) - .Case("abs32@hi", VK_AMDGPU_ABS32_HI) - .Case("hi", VK_VE_HI32) - .Case("lo", VK_VE_LO32) - .Case("pc_hi", VK_VE_PC_HI32) - .Case("pc_lo", VK_VE_PC_LO32) - .Case("got_hi", VK_VE_GOT_HI32) - .Case("got_lo", VK_VE_GOT_LO32) - .Case("gotoff_hi", VK_VE_GOTOFF_HI32) - .Case("gotoff_lo", VK_VE_GOTOFF_LO32) - .Case("plt_hi", VK_VE_PLT_HI32) - .Case("plt_lo", VK_VE_PLT_LO32) - .Case("tls_gd_hi", 
VK_VE_TLS_GD_HI32) - .Case("tls_gd_lo", VK_VE_TLS_GD_LO32) - .Case("tpoff_hi", VK_VE_TPOFF_HI32) - .Case("tpoff_lo", VK_VE_TPOFF_LO32) - .Default(VK_Invalid); + .Case("dtprel", VK_DTPREL) + .Case("dtpoff", VK_DTPOFF) + .Case("got", VK_GOT) + .Case("gotent", VK_GOTENT) + .Case("gotoff", VK_GOTOFF) + .Case("gotrel", VK_GOTREL) + .Case("pcrel", VK_PCREL) + .Case("gotpcrel", VK_GOTPCREL) + .Case("gotpcrel_norelax", VK_GOTPCREL_NORELAX) + .Case("gottpoff", VK_GOTTPOFF) + .Case("indntpoff", VK_INDNTPOFF) + .Case("ntpoff", VK_NTPOFF) + .Case("gotntpoff", VK_GOTNTPOFF) + .Case("plt", VK_PLT) + .Case("tlscall", VK_TLSCALL) + .Case("tlsdesc", VK_TLSDESC) + .Case("tlsgd", VK_TLSGD) + .Case("tlsld", VK_TLSLD) + .Case("tlsldm", VK_TLSLDM) + .Case("tpoff", VK_TPOFF) + .Case("tprel", VK_TPREL) + .Case("tlvp", VK_TLVP) + .Case("tlvppage", VK_TLVPPAGE) + .Case("tlvppageoff", VK_TLVPPAGEOFF) + .Case("page", VK_PAGE) + .Case("pageoff", VK_PAGEOFF) + .Case("gotpage", VK_GOTPAGE) + .Case("gotpageoff", VK_GOTPAGEOFF) + .Case("imgrel", VK_COFF_IMGREL32) + .Case("secrel32", VK_SECREL) + .Case("size", VK_SIZE) + .Case("abs8", VK_X86_ABS8) + .Case("pltoff", VK_X86_PLTOFF) + .Case("l", VK_PPC_LO) + .Case("h", VK_PPC_HI) + .Case("ha", VK_PPC_HA) + .Case("high", VK_PPC_HIGH) + .Case("higha", VK_PPC_HIGHA) + .Case("higher", VK_PPC_HIGHER) + .Case("highera", VK_PPC_HIGHERA) + .Case("highest", VK_PPC_HIGHEST) + .Case("highesta", VK_PPC_HIGHESTA) + .Case("got@l", VK_PPC_GOT_LO) + .Case("got@h", VK_PPC_GOT_HI) + .Case("got@ha", VK_PPC_GOT_HA) + .Case("local", VK_PPC_LOCAL) + .Case("tocbase", VK_PPC_TOCBASE) + .Case("toc", VK_PPC_TOC) + .Case("toc@l", VK_PPC_TOC_LO) + .Case("toc@h", VK_PPC_TOC_HI) + .Case("toc@ha", VK_PPC_TOC_HA) + .Case("u", VK_PPC_U) + .Case("l", VK_PPC_L) + .Case("tls", VK_PPC_TLS) + .Case("dtpmod", VK_PPC_DTPMOD) + .Case("tprel@l", VK_PPC_TPREL_LO) + .Case("tprel@h", VK_PPC_TPREL_HI) + .Case("tprel@ha", VK_PPC_TPREL_HA) + .Case("tprel@high", VK_PPC_TPREL_HIGH) + .Case("tprel@higha", VK_PPC_TPREL_HIGHA) + .Case("tprel@higher", VK_PPC_TPREL_HIGHER) + .Case("tprel@highera", VK_PPC_TPREL_HIGHERA) + .Case("tprel@highest", VK_PPC_TPREL_HIGHEST) + .Case("tprel@highesta", VK_PPC_TPREL_HIGHESTA) + .Case("dtprel@l", VK_PPC_DTPREL_LO) + .Case("dtprel@h", VK_PPC_DTPREL_HI) + .Case("dtprel@ha", VK_PPC_DTPREL_HA) + .Case("dtprel@high", VK_PPC_DTPREL_HIGH) + .Case("dtprel@higha", VK_PPC_DTPREL_HIGHA) + .Case("dtprel@higher", VK_PPC_DTPREL_HIGHER) + .Case("dtprel@highera", VK_PPC_DTPREL_HIGHERA) + .Case("dtprel@highest", VK_PPC_DTPREL_HIGHEST) + .Case("dtprel@highesta", VK_PPC_DTPREL_HIGHESTA) + .Case("got@tprel", VK_PPC_GOT_TPREL) + .Case("got@tprel@l", VK_PPC_GOT_TPREL_LO) + .Case("got@tprel@h", VK_PPC_GOT_TPREL_HI) + .Case("got@tprel@ha", VK_PPC_GOT_TPREL_HA) + .Case("got@dtprel", VK_PPC_GOT_DTPREL) + .Case("got@dtprel@l", VK_PPC_GOT_DTPREL_LO) + .Case("got@dtprel@h", VK_PPC_GOT_DTPREL_HI) + .Case("got@dtprel@ha", VK_PPC_GOT_DTPREL_HA) + .Case("got@tlsgd", VK_PPC_GOT_TLSGD) + .Case("got@tlsgd@l", VK_PPC_GOT_TLSGD_LO) + .Case("got@tlsgd@h", VK_PPC_GOT_TLSGD_HI) + .Case("got@tlsgd@ha", VK_PPC_GOT_TLSGD_HA) + .Case("got@tlsld", VK_PPC_GOT_TLSLD) + .Case("got@tlsld@l", VK_PPC_GOT_TLSLD_LO) + .Case("got@tlsld@h", VK_PPC_GOT_TLSLD_HI) + .Case("got@tlsld@ha", VK_PPC_GOT_TLSLD_HA) + .Case("got@pcrel", VK_PPC_GOT_PCREL) + .Case("got@tlsgd@pcrel", VK_PPC_GOT_TLSGD_PCREL) + .Case("got@tlsld@pcrel", VK_PPC_GOT_TLSLD_PCREL) + .Case("got@tprel@pcrel", VK_PPC_GOT_TPREL_PCREL) + .Case("tls@pcrel", VK_PPC_TLS_PCREL) + .Case("notoc", 
VK_PPC_NOTOC) + .Case("gdgot", VK_Hexagon_GD_GOT) + .Case("gdplt", VK_Hexagon_GD_PLT) + .Case("iegot", VK_Hexagon_IE_GOT) + .Case("ie", VK_Hexagon_IE) + .Case("ldgot", VK_Hexagon_LD_GOT) + .Case("ldplt", VK_Hexagon_LD_PLT) + .Case("lo8", VK_AVR_LO8) + .Case("hi8", VK_AVR_HI8) + .Case("hlo8", VK_AVR_HLO8) + .Case("typeindex", VK_WASM_TYPEINDEX) + .Case("tbrel", VK_WASM_TBREL) + .Case("mbrel", VK_WASM_MBREL) + .Case("tlsrel", VK_WASM_TLSREL) + .Case("got@tls", VK_WASM_GOT_TLS) + .Case("funcindex", VK_WASM_FUNCINDEX) + .Case("gotpcrel32@lo", VK_AMDGPU_GOTPCREL32_LO) + .Case("gotpcrel32@hi", VK_AMDGPU_GOTPCREL32_HI) + .Case("rel32@lo", VK_AMDGPU_REL32_LO) + .Case("rel32@hi", VK_AMDGPU_REL32_HI) + .Case("rel64", VK_AMDGPU_REL64) + .Case("abs32@lo", VK_AMDGPU_ABS32_LO) + .Case("abs32@hi", VK_AMDGPU_ABS32_HI) + .Case("hi", VK_VE_HI32) + .Case("lo", VK_VE_LO32) + .Case("pc_hi", VK_VE_PC_HI32) + .Case("pc_lo", VK_VE_PC_LO32) + .Case("got_hi", VK_VE_GOT_HI32) + .Case("got_lo", VK_VE_GOT_LO32) + .Case("gotoff_hi", VK_VE_GOTOFF_HI32) + .Case("gotoff_lo", VK_VE_GOTOFF_LO32) + .Case("plt_hi", VK_VE_PLT_HI32) + .Case("plt_lo", VK_VE_PLT_LO32) + .Case("tls_gd_hi", VK_VE_TLS_GD_HI32) + .Case("tls_gd_lo", VK_VE_TLS_GD_LO32) + .Case("tpoff_hi", VK_VE_TPOFF_HI32) + .Case("tpoff_lo", VK_VE_TPOFF_LO32) + .Default(VK_Invalid); } /* *** */ diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp index de1eedb8daff6..e44b4a5236915 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp @@ -185,6 +185,7 @@ unsigned SystemZELFObjectWriter::getRelocType(MCContext &Ctx, return getTLSGDReloc(Ctx, Loc, Kind); case MCSymbolRefExpr::VK_GOT: + case MCSymbolRefExpr::VK_GOTENT: if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL) return ELF::R_390_GOTENT; Ctx.reportError(Loc, "Only PC-relative GOT accesses are supported for now"); diff --git a/llvm/test/MC/SystemZ/fixups.s b/llvm/test/MC/SystemZ/fixups.s index ad0b1f18cdcfd..0d59e60fc5957 100644 --- a/llvm/test/MC/SystemZ/fixups.s +++ b/llvm/test/MC/SystemZ/fixups.s @@ -19,6 +19,12 @@ .align 16 larl %r14, target@got +# CHECK: larl %r14, target@GOTENT # encoding: [0xc0,0xe0,A,A,A,A] +# CHECK-NEXT: # fixup A - offset: 2, value: target@GOTENT+2, kind: FK_390_PC32DBL +# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_GOTENT target 0x2 + .align 16 + larl %r14, target@gotent + # CHECK: larl %r14, target@INDNTPOFF # encoding: [0xc0,0xe0,A,A,A,A] # CHECK-NEXT: # fixup A - offset: 2, value: target@INDNTPOFF+2, kind: FK_390_PC32DBL # CHECK-REL: 0x{{[0-9A-F]*2}} R_390_TLS_IEENT target 0x2 From 797f01198e8b41982916ba02d703bd6a96b5347e Mon Sep 17 00:00:00 2001 From: Leandro Lupori Date: Thu, 5 Sep 2024 14:55:01 -0300 Subject: [PATCH 268/425] [flang][OpenMP] Make lastprivate work with reallocated variables (#106559) Fixes https://github.com/llvm/llvm-project/issues/100951 --- flang/lib/Lower/Bridge.cpp | 56 ++++++++---------- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 51 +---------------- flang/runtime/assign.cpp | 2 +- .../DelayedPrivatization/equivalence.f90 | 2 +- flang/test/Lower/OpenMP/copyin-order.f90 | 4 +- flang/test/Lower/OpenMP/copyin.f90 | 20 +++---- flang/test/Lower/OpenMP/copyprivate.f90 | 2 +- flang/test/Lower/OpenMP/copyprivate2.f90 | 9 +-- .../Lower/OpenMP/default-clause-byref.f90 | 6 +- ...elayed-privatization-allocatable-array.f90 | 2 +- ...privatization-allocatable-firstprivate.f90 | 2 +- 
.../OpenMP/delayed-privatization-array.f90 | 4 +- .../delayed-privatization-firstprivate.f90 | 2 +- flang/test/Lower/OpenMP/implicit-dsa.f90 | 26 ++++----- .../Lower/OpenMP/lastprivate-allocatable.f90 | 57 ++++++++++++++----- .../Lower/OpenMP/lastprivate-commonblock.f90 | 4 +- flang/test/Lower/OpenMP/lastprivate-iv.f90 | 6 +- .../parallel-lastprivate-clause-scalar.f90 | 22 +++---- .../OpenMP/parallel-wsloop-firstpriv.f90 | 6 +- .../Lower/OpenMP/parallel-wsloop-lastpriv.f90 | 10 ++-- flang/test/Lower/OpenMP/parallel-wsloop.f90 | 8 +-- flang/test/Lower/OpenMP/sections.f90 | 20 +++---- flang/test/Lower/OpenMP/single.f90 | 4 +- .../test/Lower/OpenMP/statement-function.f90 | 4 +- flang/test/Lower/OpenMP/task.f90 | 6 +- flang/test/Lower/OpenMP/task2.f90 | 9 +-- 26 files changed, 157 insertions(+), 187 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 1f2724290b885..7cdecb788425a 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -858,7 +858,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { copyVarHLFIR(loc, Fortran::lower::SymbolBox::Intrinsic{dst}, Fortran::lower::SymbolBox::Intrinsic{src}, isAllocatable, - isPointer); + isPointer, Fortran::semantics::Symbol::Flags()); } void copyHostAssociateVar( @@ -895,7 +895,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { rhs_sb = &hsb; } - copyVar(sym, *lhs_sb, *rhs_sb); + copyVar(sym, *lhs_sb, *rhs_sb, sym.flags()); if (copyAssignIP && copyAssignIP->isSet() && sym.test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) { @@ -1211,16 +1211,18 @@ class FirConverter : public Fortran::lower::AbstractConverter { void copyVar(const Fortran::semantics::Symbol &sym, const Fortran::lower::SymbolBox &lhs_sb, - const Fortran::lower::SymbolBox &rhs_sb) { + const Fortran::lower::SymbolBox &rhs_sb, + Fortran::semantics::Symbol::Flags flags) { mlir::Location loc = genLocation(sym.name()); if (lowerToHighLevelFIR()) - copyVarHLFIR(loc, lhs_sb, rhs_sb); + copyVarHLFIR(loc, lhs_sb, rhs_sb, flags); else copyVarFIR(loc, sym, lhs_sb, rhs_sb); } void copyVarHLFIR(mlir::Location loc, Fortran::lower::SymbolBox dst, - Fortran::lower::SymbolBox src) { + Fortran::lower::SymbolBox src, + Fortran::semantics::Symbol::Flags flags) { assert(lowerToHighLevelFIR()); bool isBoxAllocatable = dst.match( @@ -1237,51 +1239,39 @@ class FirConverter : public Fortran::lower::AbstractConverter { }, [](const auto &box) { return false; }); - copyVarHLFIR(loc, dst, src, isBoxAllocatable, isBoxPointer); + copyVarHLFIR(loc, dst, src, isBoxAllocatable, isBoxPointer, flags); } void copyVarHLFIR(mlir::Location loc, Fortran::lower::SymbolBox dst, Fortran::lower::SymbolBox src, bool isAllocatable, - bool isPointer) { + bool isPointer, Fortran::semantics::Symbol::Flags flags) { assert(lowerToHighLevelFIR()); hlfir::Entity lhs{dst.getAddr()}; hlfir::Entity rhs{src.getAddr()}; - // Temporary_lhs is set to true in hlfir.assign below to avoid user - // assignment to be used and finalization to be called on the LHS. - // This may or may not be correct but mimics the current behaviour - // without HLFIR. + auto copyData = [&](hlfir::Entity l, hlfir::Entity r) { // Dereference RHS and load it if trivial scalar. 
r = hlfir::loadTrivialScalar(loc, *builder, r); - builder->create( - loc, r, l, - /*isWholeAllocatableAssignment=*/false, - /*keepLhsLengthInAllocatableAssignment=*/false, - /*temporary_lhs=*/true); + builder->create(loc, r, l, isAllocatable); }; - if (isAllocatable) { - // Deep copy allocatable if it is allocated. - // Note that when allocated, the RHS is already allocated with the LHS - // shape for copy on entry in createHostAssociateVarClone. - // For lastprivate, this assumes that the RHS was not reallocated in - // the OpenMP region. - lhs = hlfir::derefPointersAndAllocatables(loc, *builder, lhs); - mlir::Value addr = hlfir::genVariableRawAddress(loc, *builder, lhs); - mlir::Value isAllocated = builder->genIsNotNullAddr(loc, addr); - builder->genIfThen(loc, isAllocated) - .genThen([&]() { - // Copy the DATA, not the descriptors. - copyData(lhs, rhs); - }) - .end(); - } else if (isPointer) { + if (isPointer) { // Set LHS target to the target of RHS (do not copy the RHS // target data into the LHS target storage). auto loadVal = builder->create(loc, rhs); builder->create(loc, loadVal, lhs); + } else if (isAllocatable && + flags.test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate)) { + // For firstprivate allocatable variables, RHS must be copied only when + // LHS is allocated. + hlfir::Entity temp = + hlfir::derefPointersAndAllocatables(loc, *builder, lhs); + mlir::Value addr = hlfir::genVariableRawAddress(loc, *builder, temp); + mlir::Value isAllocated = builder->genIsNotNullAddr(loc, addr); + builder->genIfThen(loc, isAllocated) + .genThen([&]() { copyData(lhs, rhs); }) + .end(); } else { - // Non ALLOCATABLE/POINTER variable. Simple DATA copy. copyData(lhs, rhs); } } diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index a2003473a0fd8..78c3aa5a1c16c 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -132,54 +132,9 @@ void DataSharingProcessor::copyFirstPrivateSymbol( } void DataSharingProcessor::copyLastPrivateSymbol( - const semantics::Symbol *sym, - [[maybe_unused]] mlir::OpBuilder::InsertPoint *lastPrivIP) { - if (sym->test(semantics::Symbol::Flag::OmpLastPrivate)) { - bool allocatable = semantics::IsAllocatable(sym->GetUltimate()); - if (!allocatable) { - converter.copyHostAssociateVar(*sym, lastPrivIP); - return; - } - - // copyHostAssociateVar doesn't work properly if the privatised copy was - // reallocated (e.g. by assignment): it will only copy if the ultimate - // symbol was already allocated, and it only copies data so any reallocated - // lengths etc are lost - - // 1) Fetch the original copy of the variable. - assert(sym->has() && - "No host-association found"); - const Fortran::semantics::Symbol &hsym = sym->GetUltimate(); - Fortran::lower::SymbolBox hsb = symTable->lookupOneLevelUpSymbol(hsym); - assert(hsb && "Host symbol box not found"); - - // 2) Fetch the copied one that will mask the original. - Fortran::lower::SymbolBox sb = symTable->shallowLookupSymbol(sym); - assert(sb && "Host-associated symbol box not found"); - assert(hsb.getAddr() != sb.getAddr() && - "Host and associated symbol boxes are the same"); - - // 3) Perform the assignment. 
- fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - mlir::Location loc = converter.genLocation(sym->name()); - mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint(); - if (lastPrivIP && lastPrivIP->isSet()) - builder.restoreInsertionPoint(*lastPrivIP); - else - builder.setInsertionPointAfter(sb.getAddr().getDefiningOp()); - - hlfir::Entity dst{hsb.getAddr()}; - hlfir::Entity src{sb.getAddr()}; - builder.create( - loc, src, dst, /*isWholeAllocatableAssignment=*/allocatable, - /*keepLhsLengthInAllocatableAssignment=*/false, - /*temporary_lhs=*/false); - - if (lastPrivIP && lastPrivIP->isSet() && - sym->test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) { - builder.restoreInsertionPoint(insPt); - } - } + const semantics::Symbol *sym, mlir::OpBuilder::InsertPoint *lastPrivIP) { + if (sym->test(semantics::Symbol::Flag::OmpLastPrivate)) + converter.copyHostAssociateVar(*sym, lastPrivIP); } void DataSharingProcessor::collectOmpObjectListSymbol( diff --git a/flang/runtime/assign.cpp b/flang/runtime/assign.cpp index c3c9b0ba10ab3..d558ada51cd21 100644 --- a/flang/runtime/assign.cpp +++ b/flang/runtime/assign.cpp @@ -591,7 +591,7 @@ void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from, } } - Assign(to, from, terminator, PolymorphicLHS); + Assign(to, from, terminator, MaybeReallocate | PolymorphicLHS); } void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var, diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 index 1cbbcdcd0e4fd..2307c09513795 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 @@ -22,7 +22,7 @@ subroutine private_common ! CHECK: } copy { ! CHECK: ^bb0(%[[ORIG_PTR:.*]]: ![[X_TYPE]], %[[PRIV_REF:.*]]: ![[X_TYPE]]): ! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[ORIG_PTR]] : !fir.ptr -! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_REF]] temporary_lhs : f32, ![[X_TYPE]] +! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_REF]] : f32, ![[X_TYPE]] ! CHECK: omp.yield(%[[PRIV_REF]] : ![[X_TYPE]]) ! CHECK: } diff --git a/flang/test/Lower/OpenMP/copyin-order.f90 b/flang/test/Lower/OpenMP/copyin-order.f90 index 8999c247eebdc..ffbbad00e7cbd 100644 --- a/flang/test/Lower/OpenMP/copyin-order.f90 +++ b/flang/test/Lower/OpenMP/copyin-order.f90 @@ -6,11 +6,11 @@ !CHECK: %[[THP1:[0-9]+]] = omp.threadprivate %{{[0-9]+}}#1 !CHECK: %[[DCL1:[0-9]+]]:2 = hlfir.declare %[[THP1]] {uniq_name = "_QFcopyin_scalar_arrayEx1"} !CHECK: %[[LD1:[0-9]+]] = fir.load %{{[0-9]+}}#0 -!CHECK: hlfir.assign %[[LD1]] to %[[DCL1]]#0 temporary_lhs +!CHECK: hlfir.assign %[[LD1]] to %[[DCL1]]#0 !CHECK: %[[THP2:[0-9]+]] = omp.threadprivate %{{[0-9]+}}#1 !CHECK: %[[SHP2:[0-9]+]] = fir.shape %c{{[0-9]+}} !CHECK: %[[DCL2:[0-9]+]]:2 = hlfir.declare %[[THP2]](%[[SHP2]]) {uniq_name = "_QFcopyin_scalar_arrayEx2"} -!CHECK: hlfir.assign %{{[0-9]+}}#0 to %[[DCL2]]#0 temporary_lhs +!CHECK: hlfir.assign %{{[0-9]+}}#0 to %[[DCL2]]#0 !CHECK: omp.barrier !CHECK: fir.call @_QPsub1(%[[DCL1]]#1, %[[DCL2]]#1) !CHECK: omp.terminator diff --git a/flang/test/Lower/OpenMP/copyin.f90 b/flang/test/Lower/OpenMP/copyin.f90 index 34c83fca46417..4023987a841b8 100644 --- a/flang/test/Lower/OpenMP/copyin.f90 +++ b/flang/test/Lower/OpenMP/copyin.f90 @@ -21,11 +21,11 @@ ! CHECK: %[[VAL_11:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref -> !fir.ref ! 
CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFcopyin_scalar_arrayEx1"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_12]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_12]]#0 : i32, !fir.ref ! CHECK: %[[VAL_14:.*]] = omp.threadprivate %[[VAL_7]]#1 : !fir.ref> -> !fir.ref> ! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {uniq_name = "_QFcopyin_scalar_arrayEx2"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) -! CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_16]]#0 temporary_lhs : !fir.ref>, !fir.ref> +! CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_16]]#0 : !fir.ref>, !fir.ref> ! CHECK: omp.barrier ! CHECK: fir.call @_QPsub1(%[[VAL_12]]#1, %[[VAL_16]]#1) fastmath : (!fir.ref, !fir.ref>) -> () ! CHECK: omp.terminator @@ -61,11 +61,11 @@ subroutine copyin_scalar_array() ! CHECK: omp.parallel { ! CHECK: %[[VAL_13:.*]] = omp.threadprivate %[[VAL_2]]#1 : !fir.ref> -> !fir.ref> ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_13]] typeparams %[[VAL_1]] {uniq_name = "_QFcopyin_char_chararrayEx3"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_14]]#0 temporary_lhs : !fir.ref>, !fir.ref> +! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_14]]#0 : !fir.ref>, !fir.ref> ! CHECK: %[[VAL_15:.*]] = omp.threadprivate %[[VAL_9]]#1 : !fir.ref>> -> !fir.ref>> ! CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]](%[[VAL_16]]) typeparams %[[VAL_6]] {uniq_name = "_QFcopyin_char_chararrayEx4"} : (!fir.ref>>, !fir.shape<1>, index) -> (!fir.ref>>, !fir.ref>>) -! CHECK: hlfir.assign %[[VAL_12]]#0 to %[[VAL_17]]#0 temporary_lhs : !fir.ref>>, !fir.ref>> +! CHECK: hlfir.assign %[[VAL_12]]#0 to %[[VAL_17]]#0 : !fir.ref>>, !fir.ref>> ! CHECK: omp.barrier ! CHECK: %[[VAL_18:.*]] = fir.emboxchar %[[VAL_14]]#1, %[[VAL_1]] : (!fir.ref>, index) -> !fir.boxchar<1> ! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]]#1 : (!fir.ref>>) -> !fir.ref> @@ -116,7 +116,7 @@ subroutine copyin_char_chararray() ! CHECK: omp.parallel { ! CHECK: %[[VAL_27:.*]] = omp.threadprivate %[[VAL_17]]#1 : !fir.ref}>> -> !fir.ref}>> ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_27]] {uniq_name = "_QFcopyin_derived_typeEx5"} : (!fir.ref}>>) -> (!fir.ref}>>, !fir.ref}>>) -! CHECK: hlfir.assign %[[VAL_19]]#0 to %[[VAL_28]]#0 temporary_lhs : !fir.ref}>>, !fir.ref}>> +! CHECK: hlfir.assign %[[VAL_19]]#0 to %[[VAL_28]]#0 : !fir.ref}>>, !fir.ref}>> ! CHECK: omp.barrier ! CHECK: fir.call @_QPsub3(%[[VAL_28]]#1) fastmath : (!fir.ref}>>) -> () ! CHECK: omp.terminator @@ -150,7 +150,7 @@ subroutine copyin_derived_type() ! CHECK: %[[VAL_8:.*]] = omp.threadprivate %[[VAL_3]]#1 : !fir.ref -> !fir.ref ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFcombined_parallel_worksharing_loopEx6"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_9]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_9]]#0 : i32, !fir.ref ! CHECK: omp.barrier @@ -194,7 +194,7 @@ subroutine combined_parallel_worksharing_loop() ! CHECK: %[[VAL_4:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref -> !fir.ref ! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFcombined_parallel_sectionsEx7"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_5]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_5]]#0 : i32, !fir.ref ! CHECK: omp.barrier ! CHECK: omp.sections { ! CHECK: omp.section { @@ -247,7 +247,7 @@ subroutine combined_parallel_sections() ! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (!fir.ref) -> !fir.ref ! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFcommon_1Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_20]] to %[[VAL_19]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_20]] to %[[VAL_19]]#0 : i32, !fir.ref ! CHECK: omp.barrier ! CHECK: omp.sections { ! CHECK: omp.section { @@ -318,9 +318,9 @@ subroutine common_1() ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (!fir.ref) -> !fir.ref ! CHECK: %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFcommon_2Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_32:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_32]] to %[[VAL_26]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_32]] to %[[VAL_26]]#0 : i32, !fir.ref ! CHECK: %[[VAL_33:.*]] = fir.load %[[VAL_18]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_33]] to %[[VAL_31]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_33]] to %[[VAL_31]]#0 : i32, !fir.ref ! CHECK: omp.barrier ! CHECK: %[[VAL_19:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} diff --git a/flang/test/Lower/OpenMP/copyprivate.f90 b/flang/test/Lower/OpenMP/copyprivate.f90 index 4b3d8a6b596ef..bb86aa5356b1e 100644 --- a/flang/test/Lower/OpenMP/copyprivate.f90 +++ b/flang/test/Lower/OpenMP/copyprivate.f90 @@ -35,7 +35,7 @@ !CHECK-NEXT: %[[DST:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_copy_i32_dst"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[SRC:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_copy_i32_src"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[SRC_VAL:.*]] = fir.load %[[SRC]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[SRC_VAL]] to %[[DST]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[SRC_VAL]] to %[[DST]]#0 : i32, !fir.ref !CHECK-NEXT: return !CHECK-NEXT: } diff --git a/flang/test/Lower/OpenMP/copyprivate2.f90 b/flang/test/Lower/OpenMP/copyprivate2.f90 index 38235e8ec79c3..848ebe39e45f4 100644 --- a/flang/test/Lower/OpenMP/copyprivate2.f90 +++ b/flang/test/Lower/OpenMP/copyprivate2.f90 @@ -24,12 +24,9 @@ !CHECK-NEXT: %[[SRC:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs, !CHECK-SAME: uniq_name = "_copy_box_heap_Uxi32_src"} : (!fir.ref>>>) -> !CHECK-SAME: (!fir.ref>>>, !fir.ref>>>) -!CHECK-NEXT: %[[DST_BOX:.*]] = fir.load %[[DST]]#0 : !fir.ref>>> -!CHECK: fir.if %{{.*}} { -!CHECK-NEXT: %[[SRC_BOX:.*]] = fir.load %[[SRC]]#0 : !fir.ref>>> -!CHECK-NEXT: hlfir.assign %[[SRC_BOX]] to %[[DST_BOX]] temporary_lhs : !fir.box>>, -!CHECK-SAME: !fir.box>> -!CHECK-NEXT: } +!CHECK-NEXT: %[[SRC_BOX:.*]] = fir.load %[[SRC]]#0 : !fir.ref>>> +!CHECK-NEXT: hlfir.assign %[[SRC_BOX]] to %[[DST]]#0 realloc : !fir.box>>, +!CHECK-SAME: !fir.box>> !CHECK-NEXT: return !CHECK-NEXT: } diff --git a/flang/test/Lower/OpenMP/default-clause-byref.f90 b/flang/test/Lower/OpenMP/default-clause-byref.f90 index 626ba3335a8c1..7e9011f9c1bd5 100644 --- 
a/flang/test/Lower/OpenMP/default-clause-byref.f90 +++ b/flang/test/Lower/OpenMP/default-clause-byref.f90 @@ -15,7 +15,7 @@ !CHECK: } copy { !CHECK: ^bb0(%[[ORIG_W:.*]]: !fir.ref, %[[PRIV_W:.*]]: !fir.ref): !CHECK: %[[ORIG_W_VAL:.*]] = fir.load %[[ORIG_W]] -!CHECK: hlfir.assign %[[ORIG_W_VAL]] to %[[PRIV_W]] temporary_lhs +!CHECK: hlfir.assign %[[ORIG_W_VAL]] to %[[PRIV_W]] !CHECK: omp.yield(%[[PRIV_W]] : !fir.ref) !CHECK: } @@ -27,7 +27,7 @@ !CHECK: } copy { !CHECK: ^bb0(%[[ORIG_Y:.*]]: !fir.ref, %[[PRIV_Y:.*]]: !fir.ref): !CHECK: %[[ORIG_Y_VAL:.*]] = fir.load %[[ORIG_Y]] -!CHECK: hlfir.assign %[[ORIG_Y_VAL]] to %[[PRIV_Y]] temporary_lhs +!CHECK: hlfir.assign %[[ORIG_Y_VAL]] to %[[PRIV_Y]] !CHECK: omp.yield(%[[PRIV_Y]] : !fir.ref) !CHECK: } @@ -60,7 +60,7 @@ !CHECK: } copy { !CHECK: ^bb0(%[[ORIG_X:.*]]: !fir.ref, %[[PRIV_X:.*]]: !fir.ref): !CHECK: %[[ORIG_X_VAL:.*]] = fir.load %[[ORIG_X]] -!CHECK: hlfir.assign %[[ORIG_X_VAL]] to %[[PRIV_X]] temporary_lhs +!CHECK: hlfir.assign %[[ORIG_X_VAL]] to %[[PRIV_X]] !CHECK: omp.yield(%[[PRIV_X]] : !fir.ref) !CHECK: } diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 index 47e163014fe86..759d80cf45b2a 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 @@ -61,7 +61,7 @@ subroutine delayed_privatization_private(var1, l1) ! CHECK-NEXT: fir.if %[[COPY_COND]] { ! CHECK-NEXT: %[[PRIV_ORIG_ARG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] -! CHECK-NEXT: hlfir.assign %[[PRIV_ORIG_ARG_VAL]] to %[[PRIV_BASE_VAL]] temporary_lhs +! CHECK-NEXT: hlfir.assign %[[PRIV_ORIG_ARG_VAL]] to %[[PRIV_PRIV_ARG]] realloc ! CHECK-NEXT: } ! CHECK-NEXT: omp.yield ! CHECK-NEXT: } diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 index 5f09371bbaba2..9c97c689dad70 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 @@ -35,7 +35,7 @@ subroutine delayed_privatization_allocatable ! CHECK-NEXT: %[[ORIG_BASE_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] ! CHECK-NEXT: %[[ORIG_BASE_ADDR:.*]] = fir.box_addr %[[ORIG_BASE_VAL]] ! CHECK-NEXT: %[[ORIG_BASE_LD:.*]] = fir.load %[[ORIG_BASE_ADDR]] -! CHECK-NEXT: hlfir.assign %[[ORIG_BASE_LD]] to %[[PRIV_BASE_BOX]] temporary_lhs +! CHECK-NEXT: hlfir.assign %[[ORIG_BASE_LD]] to %[[PRIV_PRIV_ARG]] realloc ! CHECK-NEXT: } ! RUN: %flang -c -emit-llvm -fopenmp -mmlir --openmp-enable-delayed-privatization \ diff --git a/flang/test/Lower/OpenMP/delayed-privatization-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-array.f90 index 1d291b2ac0feb..3d641a0d69689 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-array.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-array.f90 @@ -42,7 +42,7 @@ subroutine delayed_privatization_private_1d(var1, l1, u1) ! ONE_DIM-NEXT: } copy { ! ONE_DIM-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]): -! ONE_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] temporary_lhs +! ONE_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] ! ONE_DIM-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]]) ! ONE_DIM-NEXT: } @@ -75,7 +75,7 @@ subroutine delayed_privatization_private_2d(var1, l1, u1, l2, u2) ! TWO_DIM-NEXT: } copy { ! 
TWO_DIM-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]): -! TWO_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] temporary_lhs +! TWO_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] ! TWO_DIM-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]]) ! TWO_DIM-NEXT: } diff --git a/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 index 0fb81d68016a4..119f77ea26626 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 @@ -23,7 +23,7 @@ subroutine delayed_privatization_firstprivate ! CHECK: } copy { ! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref, %[[PRIV_PRIV_ARG:.*]]: !fir.ref): ! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref -! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] : i32, !fir.ref ! CHECK: omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref) ! CHECK: } diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90 index fd38ca6b2f6f2..925677469847e 100644 --- a/flang/test/Lower/OpenMP/implicit-dsa.f90 +++ b/flang/test/Lower/OpenMP/implicit-dsa.f90 @@ -17,7 +17,7 @@ !CHECK-NEXT: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test1Ex"} !CHECK-NEXT: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test1Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK-NOT: fir.alloca !CHECK: } !CHECK: omp.task { @@ -44,7 +44,7 @@ subroutine implicit_dsa_test1 !CHECK: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test2Ex"} !CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test2Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: } subroutine implicit_dsa_test2 @@ -76,7 +76,7 @@ subroutine implicit_dsa_test2 !CHECK: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test3Ex"} !CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test3Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK: hlfir.assign %[[ONE]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 @@ -115,11 +115,11 @@ subroutine implicit_dsa_test3 !CHECK-NEXT: %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test4Ex"} !CHECK-NEXT: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 temporary_lhs : i32, !fir.ref 
+!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[PRIV2_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFimplicit_dsa_test4Ez"} !CHECK-NEXT: %[[PRIV2_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV2_Z]] {uniq_name = "_QFimplicit_dsa_test4Ez"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[PRIV_Z_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV2_Z_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV2_Z_DECL]]#0 : i32, !fir.ref !CHECK: %[[ZERO:.*]] = arith.constant 0 : i32 !CHECK-NEXT: hlfir.assign %[[ZERO]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 @@ -129,11 +129,11 @@ subroutine implicit_dsa_test3 !CHECK-NEXT: %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test4Ex"} !CHECK-NEXT: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[PRIV2_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test4Ey"} !CHECK-NEXT: %[[PRIV2_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV2_Y]] {uniq_name = "_QFimplicit_dsa_test4Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[PRIV_Y_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV2_Y_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV2_Y_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK-NEXT: hlfir.assign %[[ONE]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ZERO:.*]] = arith.constant 0 : i32 @@ -166,7 +166,7 @@ subroutine implicit_dsa_test4 !CHECK: %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test5Ex"} !CHECK: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test5Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK: omp.parallel { !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK: hlfir.assign %[[ONE]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref @@ -197,15 +197,15 @@ subroutine implicit_dsa_test5 !CHECK-NEXT: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test6Ex"} !CHECK-NEXT: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test6Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test6Ey"} !CHECK-NEXT: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test6Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP2]] to 
%[[PRIV_Y_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[PRIV_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFimplicit_dsa_test6Ez"} !CHECK-NEXT: %[[PRIV_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV_Z]] {uniq_name = "_QFimplicit_dsa_test6Ez"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP3:.*]] = fir.load %[[Z_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP3]] to %[[PRIV_Z_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP3]] to %[[PRIV_Z_DECL]]#0 : i32, !fir.ref !CHECK: omp.parallel private({{.*}} %{{.*}}#0 -> %[[PRIV2_X:.*]] : {{.*}}, {{.*}} %{{.*}}#0 -> %[[PRIV2_Y:.*]] : {{.*}}) { !CHECK: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test6Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NOT: hlfir.assign @@ -244,11 +244,11 @@ subroutine implicit_dsa_test6 !CHECK-NEXT: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test7Ex"} !CHECK-NEXT: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test7Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test7Ey"} !CHECK-NEXT: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: } subroutine implicit_dsa_test7 diff --git a/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 b/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 index 41bbb182aade2..0dce4d514bd10 100644 --- a/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 +++ b/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 @@ -1,18 +1,6 @@ ! RUN: %flang_fc1 -emit-hlfir -o - -fopenmp %s | FileCheck %s ! RUN: bbc -emit-hlfir -o - -fopenmp %s | FileCheck %s -program lastprivate_allocatable - integer, allocatable :: a - integer :: i - ! a is unallocated here - !$omp parallel do lastprivate(a) - do i=1,1 - a = 42 - enddo - !$omp end parallel do - ! a should be allocated here -end program - ! CHECK-LABEL: func.func @_QQmain() ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box> {bindc_name = "a", uniq_name = "_QFEa"} ! CHECK: %[[VAL_1:.*]] = fir.zero_bits !fir.heap @@ -32,7 +20,50 @@ program lastprivate_allocatable ! store loop IV ! CHECK: fir.store %{{.*}} to %[[VAL_18]]#1 : !fir.ref ! assign private variable to original copy: realloc -! CHECK: hlfir.assign %[[VAL_16]]#0 to %[[VAL_3]]#0 realloc : !fir.ref>>, !fir.ref>> +! CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_16]]#0 : !fir.ref>> +! CHECK: %[[VAL_24:.*]] = fir.box_addr %[[VAL_23]] : (!fir.box>) -> !fir.heap +! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.heap +! CHECK: hlfir.assign %[[VAL_25]] to %[[VAL_3]]#0 realloc : i32, !fir.ref>> ! CHECK-NEXT: } ! CHECK-NEXT: omp.yield ! CHECK-NEXT: } +program lastprivate_allocatable + integer, allocatable :: a + integer :: i + ! a is unallocated here + !$omp parallel do lastprivate(a) + do i=1,1 + a = 42 + enddo + !$omp end parallel do + ! a should be allocated here +end program + +! 
CHECK-LABEL: func @_QPlastprivate_realloc() +! CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFlastprivate_reallocEa"} : +! CHECK-SAME: (!fir.ref>>>>) -> +! CHECK-SAME: (!fir.ref>>>>, !fir.ref>>>>) +! CHECK: omp.parallel { +! CHECK: %[[A_PRIV:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFlastprivate_reallocEa"} : +! CHECK-SAME: (!fir.ref>>>>) -> +! CHECK-SAME: (!fir.ref>>>>, !fir.ref>>>>) +! CHECK: omp.sections { +! CHECK: omp.section { +! CHECK: %[[TEMP:.*]] = fir.load %[[A_PRIV:.*]]#0 : !fir.ref>>>> +! CHECK: hlfir.assign %[[TEMP]] to %[[A]]#0 realloc : !fir.box>>>, +! CHECK-SAME: !fir.ref>>>> +! CHECK: } +! CHECK: } +! CHECK: } +subroutine lastprivate_realloc() + complex, allocatable :: a(:) + + allocate(a(2)) + !$omp parallel + !$omp sections lastprivate(a) + !$omp section + deallocate(a) + allocate(a(3)) + !$omp end sections + !$omp end parallel +end subroutine diff --git a/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 b/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 index f823bf6c56ae3..dcba34b2da8ef 100644 --- a/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 +++ b/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 @@ -26,9 +26,9 @@ !CHECK: fir.if %[[LAST_ITER]] { !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref !CHECK: %[[PRIVATE_X_VAL:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[PRIVATE_X_VAL]] to %[[X_DECL]]#0 temporary_lhs : f32, !fir.ref +!CHECK: hlfir.assign %[[PRIVATE_X_VAL]] to %[[X_DECL]]#0 : f32, !fir.ref !CHECK: %[[PRIVATE_Y_VAL:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[PRIVATE_Y_VAL]] to %[[Y_DECL]]#0 temporary_lhs : f32, !fir.ref +!CHECK: hlfir.assign %[[PRIVATE_Y_VAL]] to %[[Y_DECL]]#0 : f32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } diff --git a/flang/test/Lower/OpenMP/lastprivate-iv.f90 b/flang/test/Lower/OpenMP/lastprivate-iv.f90 index 21a701441cb56..609192471eae1 100644 --- a/flang/test/Lower/OpenMP/lastprivate-iv.f90 +++ b/flang/test/Lower/OpenMP/lastprivate-iv.f90 @@ -24,7 +24,7 @@ !CHECK: fir.if %[[CMP]] { !CHECK: fir.store %[[V]] to %[[I]]#1 : !fir.ref !CHECK: %[[I_VAL:.*]] = fir.load %[[I]]#0 : !fir.ref -!CHECK: hlfir.assign %[[I_VAL]] to %[[I2]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[I_VAL]] to %[[I2]]#0 : i32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } @@ -62,7 +62,7 @@ subroutine lastprivate_iv_inc() !CHECK: fir.if %[[CMP]] { !CHECK: fir.store %[[V]] to %[[I]]#1 : !fir.ref !CHECK: %[[I_VAL:.*]] = fir.load %[[I]]#0 : !fir.ref -!CHECK: hlfir.assign %[[I_VAL]] to %[[I2]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[I_VAL]] to %[[I2]]#0 : i32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } @@ -88,7 +88,7 @@ subroutine lastprivate_iv_i1 !CHECK: %[[I8_VAL:.*]] = fir.convert %{{.*}} : (i32) -> i8 !CHECK: fir.store %[[I8_VAL]] to %[[IV:.*]]#1 : !fir.ref !CHECK: %[[IV_VAL:.*]] = fir.load %[[IV]]#0 : !fir.ref -!CHECK: hlfir.assign %[[IV_VAL]] to %{{.*}}#0 temporary_lhs : i8, !fir.ref +!CHECK: hlfir.assign %[[IV_VAL]] to %{{.*}}#0 : i8, !fir.ref !CHECK: } !$omp do lastprivate(i1) do i1=1,8 diff --git a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 index be0cc4195c280..62bc247a1456a 100644 --- a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 +++ b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 @@ -37,8 +37,8 @@ !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref ! 
Testing lastprivate val update -!CHECK: hlfir.assign %[[ARG1_PVT_DECL]]#0 to %[[ARG1_DECL]]#0 temporary_lhs : !fir.ref>, !fir.ref> -!CHECK: } +!CHECK: hlfir.assign %[[ARG1_PVT_DECL]]#0 to %[[ARG1_DECL]]#0 : !fir.ref>, !fir.ref> +!CHECK: } !CHECK: omp.yield !CHECK: } !CHECK: omp.terminator @@ -76,7 +76,7 @@ subroutine lastprivate_character(arg1) ! Testing lastprivate val update !CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[CLONE_LD]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[CLONE_LD]] to %[[ARG1_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } @@ -118,9 +118,9 @@ subroutine lastprivate_int(arg1) !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref ! Testing lastprivate val update !CHECK-DAG: %[[CLONE_LD1:.*]] = fir.load %[[CLONE1_DECL]]#0 : !fir.ref -!CHECK-DAG: hlfir.assign %[[CLONE_LD1]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-DAG: hlfir.assign %[[CLONE_LD1]] to %[[ARG1_DECL]]#0 : i32, !fir.ref !CHECK-DAG: %[[CLONE_LD2:.*]] = fir.load %[[CLONE2_DECL]]#0 : !fir.ref -!CHECK-DAG: hlfir.assign %[[CLONE_LD2]] to %[[ARG2_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-DAG: hlfir.assign %[[CLONE_LD2]] to %[[ARG2_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } @@ -163,9 +163,9 @@ subroutine mult_lastprivate_int(arg1, arg2) !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref !Testing lastprivate val update !CHECK-DAG: %[[CLONE_LD2:.*]] = fir.load %[[CLONE2_DECL]]#0 : !fir.ref -!CHECK-DAG: hlfir.assign %[[CLONE_LD2]] to %[[ARG2_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-DAG: hlfir.assign %[[CLONE_LD2]] to %[[ARG2_DECL]]#0 : i32, !fir.ref !CHECK-DAG: %[[CLONE_LD1:.*]] = fir.load %[[CLONE1_DECL]]#0 : !fir.ref -!CHECK-DAG: hlfir.assign %[[CLONE_LD1]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-DAG: hlfir.assign %[[CLONE_LD1]] to %[[ARG1_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } @@ -194,7 +194,7 @@ subroutine mult_lastprivate_int2(arg1, arg2) !CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, {{.*}}} !CHECK: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[FPV_LD:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 : i32, !fir.ref ! Lastprivate Allocation !CHECK: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2", pinned, {{.*}}} !CHECK: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -213,7 +213,7 @@ subroutine mult_lastprivate_int2(arg1, arg2) !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref ! 
Testing lastprivate val update !CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE2_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[CLONE_LD]] to %[[ARG2_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[CLONE_LD]] to %[[ARG2_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: } !CHECK-NEXT: omp.yield !CHECK-NEXT: } @@ -242,7 +242,7 @@ subroutine firstpriv_lastpriv_int(arg1, arg2) !CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, {{.*}}} !CHECK: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[FPV_LD:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[IV:.*]] = fir.alloca i32 {bindc_name = "n", pinned, {{.*}}} !CHECK-NEXT: hlfir.declare %[[IV]] @@ -261,7 +261,7 @@ subroutine firstpriv_lastpriv_int(arg1, arg2) !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref ! Testing lastprivate val update !CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE1_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[CLONE_LD]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[CLONE_LD]] to %[[ARG1_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: } !CHECK-NEXT: omp.yield !CHECK-NEXT: } diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 index 33dab125b3b2d..de3f42be10482 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 @@ -15,7 +15,7 @@ subroutine omp_do_firstprivate(a) ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_do_firstprivateEa"} ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK-NEXT: %[[LD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref - ! CHECK-NEXT: hlfir.assign %[[LD]] to %[[A_PVT_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK-NEXT: hlfir.assign %[[LD]] to %[[A_PVT_DECL]]#0 : i32, !fir.ref ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_firstprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -52,12 +52,12 @@ subroutine omp_do_firstprivate2(a, n) ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, {{.*}}} ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[LD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[LD]] to %[[A_PVT_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[LD]] to %[[A_PVT_DECL]]#0 : i32, !fir.ref ! CHECK: %[[N_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "n", pinned, uniq_name = "_QFomp_do_firstprivate2En"} ! CHECK: %[[N_PVT_DECL:.*]]:2 = hlfir.declare %[[N_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[LD1:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[LD1]] to %[[N_PVT_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[LD1]] to %[[N_PVT_DECL]]#0 : i32, !fir.ref ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} ! 
CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 index 9b90ac28f693c..254aeff21d06e 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 @@ -34,7 +34,7 @@ subroutine omp_do_lastprivate(a) ! CHECK: fir.if %[[SEL]] { ! CHECK: fir.store %[[NEXT_ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK: %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 : i32, !fir.ref ! CHECK: } ! CHECK-NEXT: omp.yield @@ -84,9 +84,9 @@ subroutine omp_do_lastprivate2(a, n) ! CHECK: fir.if %[[SEL]] { ! CHECK: fir.store %[[NEXT_ARG2]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK: %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 : i32, !fir.ref ! CHECK: %[[N_PVT_LOAD:.*]] = fir.load %[[N_PVT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[N_PVT_LOAD]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[N_PVT_LOAD]] to %[[ARG1_DECL]]#0 : i32, !fir.ref ! CHECK: } ! CHECK: omp.yield @@ -143,7 +143,7 @@ subroutine omp_do_lastprivate_collapse2(a) ! CHECK: fir.store %[[NEXT_ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK: fir.store %[[NEXT_ARG2]] to %[[J_PVT_DECL]]#1 : !fir.ref ! CHECK: %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 : i32, !fir.ref ! CHECK: } ! CHECK-NEXT: omp.yield @@ -219,7 +219,7 @@ subroutine omp_do_lastprivate_collapse3(a) ! CHECK: fir.store %[[NEXT_ARG2]] to %[[J_PVT_DECL]]#1 : !fir.ref ! CHECK: fir.store %[[NEXT_ARG3]] to %[[K_PVT_DECL]]#1 : !fir.ref ! CHECK: %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 : i32, !fir.ref ! CHECK: } ! CHECK-NEXT: omp.yield diff --git a/flang/test/Lower/OpenMP/parallel-wsloop.f90 b/flang/test/Lower/OpenMP/parallel-wsloop.f90 index 5853d07c46c7d..de1b8f4bc7d04 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop.f90 @@ -100,7 +100,7 @@ subroutine parallel_do_with_privatisation_clauses(cond,nt) ! CHECK: %[[PRIVATE_NT_REF:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} ! CHECK: %[[PRIVATE_NT_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_NT_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[NT_VAL:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[NT_VAL]] to %[[PRIVATE_NT_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[NT_VAL]] to %[[PRIVATE_NT_DECL]]#0 : i32, !fir.ref ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 @@ -250,7 +250,7 @@ end subroutine parallel_do_private ! 
CHECK: %[[NT_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_privateEnt"} ! CHECK: %[[NT_PRIV_DECL:.*]]:2 = hlfir.declare %[[NT_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[NT_VAL:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[NT_VAL]] to %[[NT_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[NT_VAL]] to %[[NT_PRIV_DECL]]#0 : i32, !fir.ref ! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} ! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -298,12 +298,12 @@ end subroutine omp_parallel_do_multiple_firstprivate ! CHECK: %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} ! CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[A:.*]] = fir.load %[[A_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[A]] to %[[A_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[A]] to %[[A_PRIV_DECL]]#0 : i32, !fir.ref ! CHECK: %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} ! CHECK: %[[B_PRIV_DECL:.*]]:2 = hlfir.declare %[[B_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[B:.*]] = fir.load %[[B_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[B]] to %[[B_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[B]] to %[[B_PRIV_DECL]]#0 : i32, !fir.ref ! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} ! 
CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/sections.f90 b/flang/test/Lower/OpenMP/sections.f90 index c79d6b020f3f5..2287918528635 100644 --- a/flang/test/Lower/OpenMP/sections.f90 +++ b/flang/test/Lower/OpenMP/sections.f90 @@ -83,7 +83,7 @@ end program sample !CHECK: %[[PRIVATE_ALPHA:.*]] = fir.alloca f32 {bindc_name = "alpha", pinned, uniq_name = "_QFfirstprivateEalpha"} !CHECK: %[[PRIVATE_ALPHA_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_ALPHA]] {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[ARG_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_ALPHA_DECL]]#0 temporary_lhs : f32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_ALPHA_DECL]]#0 : f32, !fir.ref !CHECK: omp.sections { !CHECK: omp.section { !CHECK: omp.terminator @@ -138,7 +138,7 @@ subroutine lastprivate() !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[TEMP1:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP1]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP1]] to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } !$omp section @@ -150,7 +150,7 @@ subroutine lastprivate() !CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"} !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: omp.barrier !CHECK: omp.sections { !$omp sections firstprivate(x) lastprivate(x) @@ -169,7 +169,7 @@ subroutine lastprivate() !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } !$omp section @@ -181,7 +181,7 @@ subroutine lastprivate() !CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"} !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: omp.barrier !CHECK: omp.sections nowait { !$omp sections firstprivate(x) lastprivate(x) @@ -200,7 +200,7 @@ subroutine lastprivate() !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } !$omp section @@ -221,7 +221,7 @@ subroutine lastprivate() !CHECK: %[[RESULT:.*]] = arith.addi 
%[[INNER_PRIVATE_X]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[LOADED_VALUE:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[LOADED_VALUE]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[LOADED_VALUE]] to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } !CHECK: omp.terminator @@ -247,9 +247,9 @@ subroutine lastprivate() !CHECK: omp.sections { !CHECK: omp.section { !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: %[[TEMP2:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP2]] to %[[Y_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP2]] to %[[Y_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } !CHECK: omp.terminator @@ -289,7 +289,7 @@ subroutine unstructured_sections_privatization() !CHECK: %[[PRIVATE_X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFunstructured_sections_privatizationEx"} !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFunstructured_sections_privatizationEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : f32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : f32, !fir.ref !CHECK: omp.sections { !CHECK: omp.section { !CHECK: cf.br ^bb1 diff --git a/flang/test/Lower/OpenMP/single.f90 b/flang/test/Lower/OpenMP/single.f90 index 2e5d05dd7b472..9fd34344eb43b 100644 --- a/flang/test/Lower/OpenMP/single.f90 +++ b/flang/test/Lower/OpenMP/single.f90 @@ -84,7 +84,7 @@ end subroutine single_allocate ! CHECK: %[[Y_PVT:.*]] = fir.alloca f64 {bindc_name = "y", pinned, uniq_name = "_QFsingle_privatizationEy"} ! CHECK: %[[Y_PVT_DECL:.*]]:2 = hlfir.declare %[[Y_PVT]] {uniq_name = "_QFsingle_privatizationEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[Y_LOAD:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PVT_DECL]]#0 temporary_lhs : f64, !fir.ref +! CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PVT_DECL]]#0 : f64, !fir.ref ! CHECK: fir.call @_QPbar(%[[X_PVT_DECL]]#1, %[[Y_PVT_DECL]]#1) fastmath : (!fir.ref, !fir.ref) -> () ! CHECK: omp.terminator ! CHECK: } @@ -112,7 +112,7 @@ subroutine single_privatization(x, y) ! CHECK: %[[Y_PVT:.*]] = fir.alloca f64 {bindc_name = "y", pinned, uniq_name = "_QFsingle_privatization2Ey"} ! CHECK: %[[Y_PVT_DECL:.*]]:2 = hlfir.declare %[[Y_PVT]] {uniq_name = "_QFsingle_privatization2Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[Y_LOAD:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PVT_DECL]]#0 temporary_lhs : f64, !fir.ref +! CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PVT_DECL]]#0 : f64, !fir.ref ! CHECK: fir.call @_QPbar(%[[X_PVT_DECL]]#1, %[[Y_PVT_DECL]]#1) fastmath : (!fir.ref, !fir.ref) -> () ! CHECK: omp.terminator ! 
CHECK: } diff --git a/flang/test/Lower/OpenMP/statement-function.f90 b/flang/test/Lower/OpenMP/statement-function.f90 index 6cdbcb6e141c7..fd6f5986bb072 100644 --- a/flang/test/Lower/OpenMP/statement-function.f90 +++ b/flang/test/Lower/OpenMP/statement-function.f90 @@ -26,10 +26,10 @@ subroutine test_implicit_use() !CHECK: omp.task !CHECK: %[[PRIV_IEXP:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_implicit_use2Eiexp"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP0:.*]] = fir.load %[[IEXP]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP0]] to %[[PRIV_IEXP]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP0]] to %[[PRIV_IEXP]]#0 : i32, !fir.ref !CHECK: %[[PRIV_IIMP:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_implicit_use2Eiimp"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP1:.*]] = fir.load %[[IIMP]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP1]] to %[[PRIV_IIMP]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP1]] to %[[PRIV_IIMP]]#0 : i32, !fir.ref subroutine test_implicit_use2() implicit none integer :: iexp, iimp diff --git a/flang/test/Lower/OpenMP/task.f90 b/flang/test/Lower/OpenMP/task.f90 index afbe2cbfa746e..28e438d342d35 100644 --- a/flang/test/Lower/OpenMP/task.f90 +++ b/flang/test/Lower/OpenMP/task.f90 @@ -201,10 +201,10 @@ subroutine task_firstprivate !CHECK: %[[INT_FIRSTPRIVATE_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "int_var", pinned, uniq_name = "_QFtask_firstprivateEint_var"} !CHECK: %[[INT_VAR_FIRSTPRIVATE:.+]]:2 = hlfir.declare %[[INT_FIRSTPRIVATE_ALLOCA]] {uniq_name = "_QFtask_firstprivateEint_var"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[INT_VAR_LOAD:.+]] = fir.load %[[INT_VAR]]#0 : !fir.ref -!CHECK: hlfir.assign %[[INT_VAR_LOAD]] to %[[INT_VAR_FIRSTPRIVATE]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[INT_VAR_LOAD]] to %[[INT_VAR_FIRSTPRIVATE]]#0 : i32, !fir.ref !CHECK: %[[MYTYPE_FIRSTPRIVATE_ALLOCA:.+]] = fir.alloca !fir.type<_QFtask_firstprivateTmytype{x:i32}> {bindc_name = "mytype_var", pinned, uniq_name = "_QFtask_firstprivateEmytype_var"} !CHECK: %[[MYTYPE_VAR_FIRSTPRIVATE:.+]]:2 = hlfir.declare %[[MYTYPE_FIRSTPRIVATE_ALLOCA]] {uniq_name = "_QFtask_firstprivateEmytype_var"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) -!CHECK: hlfir.assign %[[MYTYPE_VAR]]#0 to %[[MYTYPE_VAR_FIRSTPRIVATE]]#0 temporary_lhs : !fir.ref>, !fir.ref> +!CHECK: hlfir.assign %[[MYTYPE_VAR]]#0 to %[[MYTYPE_VAR_FIRSTPRIVATE]]#0 : !fir.ref>, !fir.ref> call baz(int_var, mytype_var) !CHECK: omp.terminator !$omp end task @@ -235,7 +235,7 @@ subroutine task_multiple_clauses() !CHECK: %[[Y_PRIV_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFtask_multiple_clausesEy"} !CHECK: %[[Y_PRIV:.+]]:2 = hlfir.declare %[[Y_PRIV_ALLOCA]] {uniq_name = "_QFtask_multiple_clausesEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[Y_LOAD:.+]] = fir.load %[[Y]]#0 : !fir.ref -!CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PRIV]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PRIV]]#0 : i32, !fir.ref !CHECK: arith.addi x = x + 12 diff --git a/flang/test/Lower/OpenMP/task2.f90 b/flang/test/Lower/OpenMP/task2.f90 index ce491d95e9397..cff9ebdd375b3 100644 --- a/flang/test/Lower/OpenMP/task2.f90 +++ b/flang/test/Lower/OpenMP/task2.f90 @@ -18,12 +18,9 @@ subroutine omp_task_nested_allocatable_firstprivate !CHECK-SAME: uniq_name = "_QFomp_task_nested_allocatable_firstprivateEa"} : !CHECK-SAME: (!fir.ref>>>) -> !CHECK-SAME: (!fir.ref>>>, !fir.ref>>>) -!CHECK: %[[PRIV_A_BOX:.*]] = fir.load %[[PRIV_A]]#0 : 
!fir.ref>>> -!CHECK: fir.if %{{.*}} { -!CHECK: %[[TEMP:.*]] = fir.load %[[A]]#0 : !fir.ref>>> -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_A_BOX]] temporary_lhs : -!CHECK-SAME: !fir.box>>, !fir.box>> -!CHECK: } +!CHECK: %[[TEMP:.*]] = fir.load %[[A]]#0 : !fir.ref>>> +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_A]]#0 realloc : +!CHECK-SAME: !fir.box>>, !fir.ref>>> !$omp task default(firstprivate) a = 2 !CHECK: } From 4d819daab91f54b90365927ba4b40e5a2eff26a9 Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Thu, 5 Sep 2024 10:58:37 -0700 Subject: [PATCH 269/425] [compiler-rt] Simplify definition of uptr We can rely on the compiler-provided macro __UINTPTR_TYPE__ for all non-MSVC compilers. I verified via https://godbolt.org/z/MW9KMjv5f that this works for MSVC as well as GCC 4.5 Clang 3.0, so that should cover all supported compilers. This means we no longer need to explicitly handle new architectures and as an added bonus also adds support for architectures where `unsigned long long` cannot be used to hold pointers (e.g. CHERI). Reviewed By: mstorsjo, vitalybuka Pull Request: https://github.com/llvm/llvm-project/pull/106155 --- .../sanitizer_common/sanitizer_internal_defs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index fefe28e811767..f8f03454ea169 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -138,19 +138,19 @@ // in a portable way by the language itself. namespace __sanitizer { -#if defined(_WIN64) +#if defined(__UINTPTR_TYPE__) +typedef __UINTPTR_TYPE__ uptr; +typedef __INTPTR_TYPE__ sptr; +#elif defined(_WIN64) // 64-bit Windows uses LLP64 data model. typedef unsigned long long uptr; typedef signed long long sptr; -#else -# if (SANITIZER_WORDSIZE == 64) || SANITIZER_APPLE -typedef unsigned long uptr; -typedef signed long sptr; -# else +#elif defined(_WIN32) typedef unsigned int uptr; typedef signed int sptr; -# endif -#endif // defined(_WIN64) +#else +# error Unsupported compiler, missing __UINTPTR_TYPE__ +#endif // defined(__UINTPTR_TYPE__) #if defined(__x86_64__) // Since x32 uses ILP32 data model in 64-bit hardware mode, we must use // 64-bit pointer to unwind stack frame. From b2dbcf4dc1078fd62ef2295ff9696173a9991116 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 11:43:19 -0700 Subject: [PATCH 270/425] [Presburger] Avoid repeated hash lookups (NFC) (#107426) --- mlir/lib/Analysis/Presburger/IntegerRelation.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp index 87204d2100713..94af81f955e5a 100644 --- a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp @@ -2367,10 +2367,8 @@ bool IntegerRelation::removeDuplicateConstraints() { hashTable.insert({row, 0}); for (unsigned k = 1; k < ineqs; ++k) { row = getInequality(k).drop_back(); - if (!hashTable.contains(row)) { - hashTable.insert({row, k}); + if (hashTable.try_emplace(row, k).second) continue; - } // For identical cases, keep only the smaller part of the constant term. 
unsigned l = hashTable[row]; From 7cf18ff22b626efb0dad6eb9daebea821faff438 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 11:43:36 -0700 Subject: [PATCH 271/425] [LLVMIR] Avoid repeated hash lookups (NFC) (#107428) --- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index bd76165053488..d1732cb808928 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1021,9 +1021,8 @@ ModuleImport::getConstantsToConvert(llvm::Constant *constant) { llvm::Constant *current = workList.back(); // Collect all dependencies of the current constant and add them to the // adjacency list if none has been computed before. - auto adjacencyIt = adjacencyLists.find(current); - if (adjacencyIt == adjacencyLists.end()) { - adjacencyIt = adjacencyLists.try_emplace(current).first; + auto [adjacencyIt, inserted] = adjacencyLists.try_emplace(current); + if (inserted) { // Add all constant operands to the adjacency list and skip any other // values such as basic block addresses. for (llvm::Value *operand : current->operands()) From 92e75c095bb380039f32218534f78c4580bf76e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 5 Sep 2024 11:44:12 -0700 Subject: [PATCH 272/425] Reland [flang][cuda] Add c_devptr and bypass output semantic check (#107353) Add a builtin type for c_devptr since it will need some special handling for some functions like c_f_pointer. `c_ptr` is defined as a builtin type and was raising a semantic error if you tried to use it in an I/O statement. This patch adds a check for c_ptr and c_devptr to bypass the semantic check and allow variables of these types to be used in I/O. This version of the patch keeps the semantic error when -pedantic is enabled to align with gfortran.
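To make the intended behavior concrete, a minimal standalone sketch of the decision being added follows; ShouldBypassIoCheck and its parameters are illustrative stand-ins, not flang's actual semantics API:

  #include <iostream>
  #include <string>

  // Sketch only: should an I/O item whose type is the named builtin derived
  // type skip the usual "inaccessible component" diagnostic?
  static bool ShouldBypassIoCheck(const std::string &typeName,
                                  bool pedanticWarnings) {
    bool isCInteropHandle = typeName == "c_ptr" || typeName == "c_devptr";
    return isCInteropHandle && !pedanticWarnings;
  }

  int main() {
    // Default behavior after this change: PRINT of a c_devptr is accepted.
    std::cout << ShouldBypassIoCheck("c_devptr", false) << '\n'; // 1
    // With -pedantic the original error is kept, matching gfortran.
    std::cout << ShouldBypassIoCheck("c_devptr", true) << '\n';  // 0
    return 0;
  }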
--- flang/include/flang/Common/Fortran-features.h | 2 +- flang/lib/Semantics/check-io.cpp | 6 ++++++ flang/module/__fortran_builtins.f90 | 4 ++++ flang/test/Lower/CUDA/cuda-devptr.cuf | 16 ++++++++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 flang/test/Lower/CUDA/cuda-devptr.cuf diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index 6ef5f44c89db0..0c8a3d2bd5281 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -51,7 +51,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize, NonBindCInteroperability, CudaManaged, CudaUnified, PolymorphicActualAllocatableOrPointerToMonomorphicDummy, RelaxedPureDummy, - UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram) + UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp index 54e8e09cbf7e4..46f07842b92cb 100644 --- a/flang/lib/Semantics/check-io.cpp +++ b/flang/lib/Semantics/check-io.cpp @@ -1171,6 +1171,12 @@ parser::Message *IoChecker::CheckForBadIoType(const evaluate::DynamicType &type, "Derived type '%s' in I/O may not be polymorphic unless using defined I/O"_err_en_US, derived.name()); } + if ((IsBuiltinDerivedType(&derived, "c_ptr") || + IsBuiltinDerivedType(&derived, "c_devptr")) && + !context_.ShouldWarn(common::LanguageFeature::PrintCptr)) { + // Bypass the check below for c_ptr and c_devptr. + return nullptr; + } if (const Symbol * bad{FindInaccessibleComponent(which, derived, scope)}) { return &context_.Say(where, diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90 index 44b0f17339cd9..a9d3ac897eb58 100644 --- a/flang/module/__fortran_builtins.f90 +++ b/flang/module/__fortran_builtins.f90 @@ -102,6 +102,10 @@ __builtin_threadIdx, __builtin_blockDim, __builtin_blockIdx, & __builtin_gridDim integer, parameter, public :: __builtin_warpsize = 32 + + type, public, bind(c) :: __builtin_c_devptr + type(__builtin_c_ptr) :: cptr + end type intrinsic :: __builtin_fma intrinsic :: __builtin_ieee_is_nan, __builtin_ieee_is_negative, & diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf new file mode 100644 index 0000000000000..4e11e3c0fc8f8 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ -0,0 +1,16 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran specific type + +subroutine sub1() + use iso_c_binding + use __fortran_builtins, only : c_devptr => __builtin_c_devptr + + type(c_ptr) :: ptr + type(c_devptr) :: dptr + print*,ptr + print*,dptr +end + +! CHECK-LABEL: func.func @_QPsub1() +! CHECK-COUNT-2: %{{.*}} = fir.call @_FortranAioOutputDerivedType From d219c63b16851ba264b6495e3f63016d1c8b2aac Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Thu, 5 Sep 2024 20:45:31 +0200 Subject: [PATCH 273/425] [Clang] Fix crash with `source_location` in lambda declarators. (#107411) Parsing lambdas requires pushing a declaration context for the lambda, so that parameters can be attached to it, before its trailing type is parsed. At that point, partially-parsed lambdas don't have a name that can be computed for them. This would cause source_location::current() to crash when used in the decltype of a lambda().
We work around this by producing a source_location for an enclosing scope in that scenario. Fixes #67134 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/AST/Expr.cpp | 10 ++++++++++ clang/test/SemaCXX/source_location.cpp | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ebd0b7371e1be..a2e91fd648cce 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -369,6 +369,7 @@ Bug Fixes to C++ Support - Clang no longer tries to capture non-odr used default arguments of template parameters of generic lambdas (#GH107048) - Fixed a bug where defaulted comparison operators would remove ``const`` from base classes. (#GH102588) +- Fix a crash when using ``source_location`` in the trailing return type of a lambda expression. (#GH67134) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 96c6276f3f34c..27930db019a17 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -13,6 +13,7 @@ #include "clang/AST/Expr.h" #include "clang/AST/APValue.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ASTLambda.h" #include "clang/AST/Attr.h" #include "clang/AST/ComputeDependence.h" #include "clang/AST/DeclCXX.h" @@ -2287,6 +2288,15 @@ APValue SourceLocExpr::EvaluateInContext(const ASTContext &Ctx, Context = getParentContext(); } + // If we are currently parsing a lambda declarator, we might not have a fully + // formed call operator declaration yet, and we could not form a function name + // for it. Because we do not have access to Sema/function scopes here, we + // detect this case by relying on the fact such method doesn't yet have a + // type. + if (const auto *D = dyn_cast(Context); + D && D->getFunctionTypeLoc().isNull() && isLambdaCallOperator(D)) + Context = D->getParent()->getParent(); + PresumedLoc PLoc = Ctx.getSourceManager().getPresumedLoc( Ctx.getSourceManager().getExpansionRange(Loc).getEnd()); diff --git a/clang/test/SemaCXX/source_location.cpp b/clang/test/SemaCXX/source_location.cpp index 34177bfe287fc..8b3a5d8dd3327 100644 --- a/clang/test/SemaCXX/source_location.cpp +++ b/clang/test/SemaCXX/source_location.cpp @@ -989,3 +989,26 @@ void Test() { } #endif + + +namespace GH67134 { +template +constexpr auto f(std::source_location loc2 = std::source_location::current()) { return loc; } + +int g = []() -> decltype(f()) { return 0; }(); + +int call() { +#if __cplusplus >= 202002L + return []() -> decltype(f()) { return 0; }(); +#endif + return []() -> decltype(f()) { return 0; }(); +} + +#if __cplusplus >= 202002L +template +int Var = requires { []() -> decltype(f()){}; }; +int h = Var; +#endif + + +} From 18926666f509104c3f478444b282291ce19fab6a Mon Sep 17 00:00:00 2001 From: SJW <48454132+sjw36@users.noreply.github.com> Date: Thu, 5 Sep 2024 13:46:18 -0500 Subject: [PATCH 274/425] [MLIR][SCF] Loop pipelining fails on failed predication (no assert) (#107442) The SCFLoopPipelining allows predication on peeled or loop ops. When the predicationFn returns a nullptr this signifies the op type is unsupported and the pipeliner fails except in `emitPrologue` where it asserts. This patch fixes handling in the prologue to gracefully fail. 
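To make the control-flow change concrete, here is a minimal sketch under simplified assumptions: Op, Rewriter, and PredicateFn below are made-up stand-ins rather than the real MLIR classes. The point is that a null result from the user-supplied predication callback is now propagated to the caller as a recoverable failure instead of tripping an assert in the prologue.

  #include <functional>
  #include <vector>

  struct Rewriter {};
  struct Op {};
  using PredicateFn = std::function<Op *(Rewriter &, Op *, bool)>;

  // Sketch only: returns false when the callback cannot predicate an op, so
  // the caller can skip pipelining this loop rather than abort the compiler.
  static bool emitPrologue(Rewriter &rewriter, const std::vector<Op *> &ops,
                           const PredicateFn &predicateFn) {
    for (Op *op : ops) {
      Op *predicated = predicateFn(rewriter, op, /*predicate=*/true);
      if (predicated == nullptr)
        return false; // graceful failure, previously an assert
    }
    return true;
  }

  int main() {
    Rewriter rewriter;
    Op op;
    std::vector<Op *> ops{&op};
    // A callback that supports nothing: pipelining is skipped, not aborted.
    PredicateFn rejectAll = [](Rewriter &, Op *, bool) -> Op * { return nullptr; };
    return emitPrologue(rewriter, ops, rejectAll) ? 0 : 1;
  }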
--- mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp index a34542f0161ac..7cecd4942b640 100644 --- a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp @@ -77,7 +77,7 @@ struct LoopPipelinerInternal { bool initializeLoopInfo(ForOp op, const PipeliningOption &options); /// Emits the prologue, this creates `maxStage - 1` part which will contain /// operations from stages [0; i], where i is the part index. - void emitPrologue(RewriterBase &rewriter); + LogicalResult emitPrologue(RewriterBase &rewriter); /// Gather liverange information for Values that are used in a different stage /// than its definition. llvm::MapVector analyzeCrossStageValues(); @@ -263,7 +263,7 @@ cloneAndUpdateOperands(RewriterBase &rewriter, Operation *op, return clone; } -void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { +LogicalResult LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { // Initialize the iteration argument to the loop initial values. for (auto [arg, operand] : llvm::zip(forOp.getRegionIterArgs(), forOp.getInitsMutable())) { @@ -311,7 +311,8 @@ void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { if (predicates[predicateIdx]) { OpBuilder::InsertionGuard insertGuard(rewriter); newOp = predicateFn(rewriter, newOp, predicates[predicateIdx]); - assert(newOp && "failed to predicate op."); + if (newOp == nullptr) + return failure(); } if (annotateFn) annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i); @@ -339,6 +340,7 @@ void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { } } } + return success(); } llvm::MapVector @@ -772,7 +774,8 @@ FailureOr mlir::scf::pipelineForLoop(RewriterBase &rewriter, ForOp forOp, *modifiedIR = true; // 1. Emit prologue. - pipeliner.emitPrologue(rewriter); + if (failed(pipeliner.emitPrologue(rewriter))) + return failure(); // 2. Track values used across stages. When a value cross stages it will // need to be passed as loop iteration arguments. From 3f1d0e1b1dfef0af0ca5f3315317246d0026fb70 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 5 Sep 2024 11:53:22 -0700 Subject: [PATCH 275/425] [flang] Silence warning in module file (#107421) Most warnings should be silenced when processing the content of a module file, since the warning should have also appeared when the module file was generated. The case of an intrinsic type kind not being supported for a target wasn't being suppressed; fix. Fixes https://github.com/llvm/llvm-project/issues/107337. 
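As a rough illustration of the suppression logic (Location and warnIfKindNotEnabled are made-up stand-ins, not flang's real interfaces): the warning is emitted only when the use site is ordinary user code, since a use inside a module file was already diagnosed when that module was compiled.

  #include <iostream>

  struct Location {
    bool inModuleFile; // true when the source text comes from a module file
  };

  // Sketch only.
  static void warnIfKindNotEnabled(const char *type, int kind,
                                   bool enabledForTarget, const Location &loc) {
    if (!enabledForTarget && !loc.inModuleFile)
      std::cout << "warning: " << type << "(KIND=" << kind
                << ") is not an enabled type for this target\n";
  }

  int main() {
    warnIfKindNotEnabled("REAL", 16, false, Location{true});  // silent
    warnIfKindNotEnabled("REAL", 16, false, Location{false}); // warns
    return 0;
  }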
--- flang/lib/Semantics/expression.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 60db02dc764b4..3684839c187e6 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -4027,7 +4027,8 @@ bool ExpressionAnalyzer::CheckIntrinsicKind( return true; } else if (foldingContext_.targetCharacteristics().CanSupportType( category, kind)) { - if (context_.ShouldWarn(common::UsageWarning::BadTypeForTarget)) { + if (context_.ShouldWarn(common::UsageWarning::BadTypeForTarget) && + !context_.IsInModuleFile(GetContextualMessages().at())) { Say("%s(KIND=%jd) is not an enabled type for this target"_warn_en_US, ToUpperCase(EnumToString(category)), kind); } From 5e1e6a689c82aaf2b7af72e074c95889a11d3a78 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 11:58:03 -0700 Subject: [PATCH 276/425] [TableGen] Avoid repeated hash lookups (NFC) (#107429) --- mlir/lib/TableGen/Pattern.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mlir/lib/TableGen/Pattern.cpp b/mlir/lib/TableGen/Pattern.cpp index afb69e7cc5586..1be0e744ffbc8 100644 --- a/mlir/lib/TableGen/Pattern.cpp +++ b/mlir/lib/TableGen/Pattern.cpp @@ -137,11 +137,10 @@ llvm::StringRef DagNode::getSymbol() const { return node->getNameStr(); } Operator &DagNode::getDialectOp(RecordOperatorMap *mapper) const { llvm::Record *opDef = cast(node->getOperator())->getDef(); - auto it = mapper->find(opDef); - if (it != mapper->end()) - return *it->second; - return *mapper->try_emplace(opDef, std::make_unique(opDef)) - .first->second; + auto [it, inserted] = mapper->try_emplace(opDef); + if (inserted) + it->second = std::make_unique(opDef); + return *it->second; } int DagNode::getNumOps() const { From a0dd90eb7dc318c9b3fccb9ba02e1e22fb073094 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 12:19:31 -0700 Subject: [PATCH 277/425] =?UTF-8?q?[lldb]=20Make=20conversions=20from=20ll?= =?UTF-8?q?vm::Error=20explicit=20with=20Status::FromEr=E2=80=A6=20(#10716?= =?UTF-8?q?3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ror() [NFC] --- lldb/include/lldb/Utility/Status.h | 7 +- lldb/source/API/SBDebugger.cpp | 2 +- lldb/source/API/SBTarget.cpp | 2 +- .../Commands/CommandObjectBreakpoint.cpp | 103 ++++++++++-------- .../Commands/CommandObjectMemoryTag.cpp | 10 +- lldb/source/Commands/CommandObjectStats.cpp | 6 +- lldb/source/Commands/CommandObjectTrace.cpp | 2 +- lldb/source/Core/PluginManager.cpp | 2 +- lldb/source/Core/ThreadedCommunication.cpp | 2 +- lldb/source/Core/ValueObjectVTable.cpp | 2 +- lldb/source/Core/ValueObjectVariable.cpp | 2 +- lldb/source/DataFormatters/VectorType.cpp | 2 +- lldb/source/Host/common/FileCache.cpp | 2 +- .../Host/common/NativeProcessProtocol.cpp | 2 +- lldb/source/Host/common/TCPSocket.cpp | 6 +- lldb/source/Host/macosx/objcxx/Host.mm | 2 +- .../posix/ConnectionFileDescriptorPosix.cpp | 6 +- lldb/source/Interpreter/CommandObject.cpp | 2 +- lldb/source/Interpreter/OptionValueRegex.cpp | 2 +- .../Language/CPlusPlus/BlockPointer.cpp | 2 +- .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 2 +- .../Minidump/ObjectFileMinidump.cpp | 2 +- ...latformiOSSimulatorCoreSimulatorSupport.mm | 2 +- .../Process/Linux/NativeProcessLinux.cpp | 16 +-- .../NativeRegisterContextDBReg_arm64.cpp | 10 +- .../Process/elf-core/ProcessElfCore.cpp | 2 +- .../gdb-remote/GDBRemoteCommunication.cpp | 2 +- .../GDBRemoteCommunicationClient.cpp 
| 2 +- .../GDBRemoteCommunicationServer.cpp | 2 +- .../GDBRemoteCommunicationServerLLGS.cpp | 8 +- .../GDBRemoteCommunicationServerPlatform.cpp | 2 +- .../Process/minidump/ProcessMinidump.cpp | 2 +- .../Interfaces/ScriptedPythonInterface.h | 2 +- .../Python/PythonDataObjects.cpp | 22 ++-- .../Python/ScriptInterpreterPython.cpp | 10 +- .../DarwinLog/StructuredDataDarwinLog.cpp | 2 +- lldb/source/Target/ModuleCache.cpp | 2 +- lldb/source/Target/Platform.cpp | 2 +- lldb/source/Target/Process.cpp | 4 +- lldb/source/Target/StackFrame.cpp | 2 +- lldb/source/Target/Thread.cpp | 3 +- lldb/source/Utility/Scalar.cpp | 2 +- lldb/source/Utility/Status.cpp | 8 +- lldb/source/Utility/StructuredData.cpp | 2 +- .../Host/NativeProcessTestUtils.h | 4 +- lldb/unittests/Utility/StatusTest.cpp | 14 ++- 46 files changed, 157 insertions(+), 140 deletions(-) diff --git a/lldb/include/lldb/Utility/Status.h b/lldb/include/lldb/Utility/Status.h index b304291ffae00..3813a3c160470 100644 --- a/lldb/include/lldb/Utility/Status.h +++ b/lldb/include/lldb/Utility/Status.h @@ -91,9 +91,9 @@ class Status { ~Status(); - // llvm::Error support - explicit Status(llvm::Error error) { *this = std::move(error); } - const Status &operator=(llvm::Error error); + /// Avoid using this in new code. Migrate APIs to llvm::Expected instead. + static Status FromError(llvm::Error error); + /// FIXME: Replace this with a takeError method. llvm::Error ToError() const; /// Get the error string associated with the current error. @@ -145,6 +145,7 @@ class Status { bool Success() const; protected: + Status(llvm::Error error); /// Status code as an integer value. ValueType m_code = 0; /// The type of the above error code. diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 72501570320d5..c226acc15018e 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -220,7 +220,7 @@ lldb::SBError SBDebugger::InitializeWithErrorHandling() { SBError error; if (auto e = g_debugger_lifetime->Initialize( std::make_unique(), LoadPlugin)) { - error.SetError(Status(std::move(e))); + error.SetError(Status::FromError(std::move(e))); } return error; } diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index e927cb854cd88..41eb77e5506bc 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -1658,7 +1658,7 @@ SBError SBTarget::SetLabel(const char *label) { if (!target_sp) return Status::FromErrorString("Couldn't get internal target object."); - return Status(target_sp->SetLabel(label)); + return Status::FromError(target_sp->SetLabel(label)); } uint32_t SBTarget::GetDataByteSize() { diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp index ede3dd2f2a864..494d6c50e94ac 100644 --- a/lldb/source/Commands/CommandObjectBreakpoint.cpp +++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp @@ -89,14 +89,16 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { if (success) m_bp_opts.SetAutoContinue(value); else - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); } break; case 'i': { uint32_t ignore_count; if (option_arg.getAsInteger(0, ignore_count)) - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + error = Status::FromError( + 
CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message)); else m_bp_opts.SetIgnoreCount(ignore_count); } break; @@ -106,29 +108,31 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { if (success) { m_bp_opts.SetOneShot(value); } else - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); } break; case 't': { lldb::tid_t thread_id = LLDB_INVALID_THREAD_ID; if (option_arg == "current") { if (!execution_context) { - error = CreateOptionParsingError( + error = Status::FromError(CreateOptionParsingError( option_arg, short_option, long_option, - "No context to determine current thread"); + "No context to determine current thread")); } else { ThreadSP ctx_thread_sp = execution_context->GetThreadSP(); if (!ctx_thread_sp || !ctx_thread_sp->IsValid()) { - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - "No currently selected thread"); + "No currently selected thread")); } else { thread_id = ctx_thread_sp->GetID(); } } } else if (option_arg.getAsInteger(0, thread_id)) { - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message)); } if (thread_id != LLDB_INVALID_THREAD_ID) m_bp_opts.SetThreadID(thread_id); @@ -142,8 +146,9 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { case 'x': { uint32_t thread_index = UINT32_MAX; if (option_arg.getAsInteger(0, thread_index)) { - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message)); } else { m_bp_opts.GetThreadSpec()->SetIndex(thread_index); } @@ -286,9 +291,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { case 'u': if (option_arg.getAsInteger(0, m_column)) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + g_int_parsing_error_message)); break; case 'E': { @@ -326,8 +331,8 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { error_context = "Unsupported language type for exception breakpoint"; } if (!error_context.empty()) - error = CreateOptionParsingError(option_arg, short_option, - long_option, error_context); + error = Status::FromError(CreateOptionParsingError( + option_arg, short_option, long_option, error_context)); } break; case 'f': @@ -343,9 +348,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { bool success; m_catch_bp = OptionArgParser::ToBoolean(option_arg, true, &success); if (!success) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + g_bool_parsing_error_message)); } break; case 'H': @@ -362,24 +367,24 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { m_skip_prologue = eLazyBoolNo; if (!success) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + g_bool_parsing_error_message)); } break; case 'l': if (option_arg.getAsInteger(0, m_line_num)) - error = + error = 
Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + g_int_parsing_error_message)); break; case 'L': m_language = Language::GetLanguageTypeFromString(option_arg); if (m_language == eLanguageTypeUnknown) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_language_parsing_error_message); + g_language_parsing_error_message)); break; case 'm': { @@ -392,9 +397,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { m_move_to_nearest_code = eLazyBoolNo; if (!success) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + g_bool_parsing_error_message)); break; } @@ -412,8 +417,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { if (BreakpointID::StringIsBreakpointName(option_arg, error)) m_breakpoint_names.push_back(std::string(option_arg)); else - error = CreateOptionParsingError( - option_arg, short_option, long_option, "Invalid breakpoint name"); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + "Invalid breakpoint name")); break; } @@ -451,9 +457,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { bool success; m_throw_bp = OptionArgParser::ToBoolean(option_arg, true, &success); if (!success) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + g_bool_parsing_error_message)); } break; case 'X': @@ -465,8 +471,8 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { OptionValueFileColonLine value; Status fcl_err = value.SetValueFromString(option_arg); if (!fcl_err.Success()) { - error = CreateOptionParsingError(option_arg, short_option, - long_option, fcl_err.AsCString()); + error = Status::FromError(CreateOptionParsingError( + option_arg, short_option, long_option, fcl_err.AsCString())); } else { m_filenames.AppendIfUnique(value.GetFileSpec()); m_line_num = value.GetLineNumber(); @@ -1551,13 +1557,15 @@ class BreakpointNameOptionGroup : public OptionGroup { break; case 'B': if (m_breakpoint.SetValueFromString(option_arg).Fail()) - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message)); break; case 'D': if (m_use_dummy.SetValueFromString(option_arg).Fail()) - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); break; case 'H': m_help_string.SetValueFromString(option_arg); @@ -1610,8 +1618,9 @@ class BreakpointAccessOptionGroup : public OptionGroup { if (success) { m_permissions.SetAllowList(value); } else - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); } break; case 'A': { bool value, success; @@ -1619,8 +1628,9 @@ class BreakpointAccessOptionGroup : public OptionGroup { if (success) { m_permissions.SetAllowDisable(value); } else - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + 
CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); } break; case 'D': { bool value, success; @@ -1628,8 +1638,9 @@ class BreakpointAccessOptionGroup : public OptionGroup { if (success) { m_permissions.SetAllowDelete(value); } else - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); } break; default: llvm_unreachable("Unimplemented option"); @@ -2113,8 +2124,8 @@ class CommandObjectBreakpointRead : public CommandObjectParsed { Status name_error; if (!BreakpointID::StringIsBreakpointName(llvm::StringRef(option_arg), name_error)) { - error = CreateOptionParsingError(option_arg, short_option, - long_option, name_error.AsCString()); + error = Status::FromError(CreateOptionParsingError( + option_arg, short_option, long_option, name_error.AsCString())); } m_names.push_back(std::string(option_arg)); break; diff --git a/lldb/source/Commands/CommandObjectMemoryTag.cpp b/lldb/source/Commands/CommandObjectMemoryTag.cpp index f45d6eacab3d0..bc76319018da9 100644 --- a/lldb/source/Commands/CommandObjectMemoryTag.cpp +++ b/lldb/source/Commands/CommandObjectMemoryTag.cpp @@ -77,7 +77,7 @@ class CommandObjectMemoryTagRead : public CommandObjectParsed { process->GetMemoryTagManager(); if (!tag_manager_or_err) { - result.SetError(Status(tag_manager_or_err.takeError())); + result.SetError(Status::FromError(tag_manager_or_err.takeError())); return; } @@ -102,7 +102,7 @@ class CommandObjectMemoryTagRead : public CommandObjectParsed { tag_manager->MakeTaggedRange(start_addr, end_addr, memory_regions); if (!tagged_range) { - result.SetError(Status(tagged_range.takeError())); + result.SetError(Status::FromError(tagged_range.takeError())); return; } @@ -110,7 +110,7 @@ class CommandObjectMemoryTagRead : public CommandObjectParsed { tagged_range->GetRangeBase(), tagged_range->GetByteSize()); if (!tags) { - result.SetError(Status(tags.takeError())); + result.SetError(Status::FromError(tags.takeError())); return; } @@ -230,7 +230,7 @@ class CommandObjectMemoryTagWrite : public CommandObjectParsed { process->GetMemoryTagManager(); if (!tag_manager_or_err) { - result.SetError(Status(tag_manager_or_err.takeError())); + result.SetError(Status::FromError(tag_manager_or_err.takeError())); return; } @@ -282,7 +282,7 @@ class CommandObjectMemoryTagWrite : public CommandObjectParsed { memory_regions); if (!tagged_range) { - result.SetError(Status(tagged_range.takeError())); + result.SetError(Status::FromError(tagged_range.takeError())); return; } diff --git a/lldb/source/Commands/CommandObjectStats.cpp b/lldb/source/Commands/CommandObjectStats.cpp index 53855e7d03165..7d333afc231ba 100644 --- a/lldb/source/Commands/CommandObjectStats.cpp +++ b/lldb/source/Commands/CommandObjectStats.cpp @@ -87,21 +87,21 @@ class CommandObjectStatsDump : public CommandObjectParsed { OptionArgParser::ToBoolean("--targets", option_arg)) m_stats_options.SetIncludeTargets(*bool_or_error); else - error = bool_or_error.takeError(); + error = Status::FromError(bool_or_error.takeError()); break; case 'm': if (llvm::Expected bool_or_error = OptionArgParser::ToBoolean("--modules", option_arg)) m_stats_options.SetIncludeModules(*bool_or_error); else - error = bool_or_error.takeError(); + error = Status::FromError(bool_or_error.takeError()); break; case 't': if (llvm::Expected bool_or_error = 
OptionArgParser::ToBoolean("--transcript", option_arg)) m_stats_options.SetIncludeTranscript(*bool_or_error); else - error = bool_or_error.takeError(); + error = Status::FromError(bool_or_error.takeError()); break; default: llvm_unreachable("Unimplemented option"); diff --git a/lldb/source/Commands/CommandObjectTrace.cpp b/lldb/source/Commands/CommandObjectTrace.cpp index 5bcbc236301cc..5e212e05461a6 100644 --- a/lldb/source/Commands/CommandObjectTrace.cpp +++ b/lldb/source/Commands/CommandObjectTrace.cpp @@ -361,7 +361,7 @@ class CommandObjectTraceSchema : public CommandObjectParsed { Trace::FindPluginSchema(plugin_name)) result.AppendMessage(*schemaOrErr); else - error = schemaOrErr.takeError(); + error = Status::FromError(schemaOrErr.takeError()); } if (error.Success()) { diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp index fd5cb792c101a..a5219025495a9 100644 --- a/lldb/source/Core/PluginManager.cpp +++ b/lldb/source/Core/PluginManager.cpp @@ -723,7 +723,7 @@ Status PluginManager::SaveCore(const lldb::ProcessSP &process_sp, llvm::Expected ret = process_sp->SaveCore(options.GetOutputFile()->GetPath()); if (!ret) - return Status(ret.takeError()); + return Status::FromError(ret.takeError()); if (ret.get()) return Status(); } diff --git a/lldb/source/Core/ThreadedCommunication.cpp b/lldb/source/Core/ThreadedCommunication.cpp index d8b567c9bd0de..649ce71c29374 100644 --- a/lldb/source/Core/ThreadedCommunication.cpp +++ b/lldb/source/Core/ThreadedCommunication.cpp @@ -178,7 +178,7 @@ bool ThreadedCommunication::StartReadThread(Status *error_ptr) { m_read_thread = *maybe_thread; } else { if (error_ptr) - *error_ptr = Status(maybe_thread.takeError()); + *error_ptr = Status::FromError(maybe_thread.takeError()); else { LLDB_LOG_ERROR(GetLog(LLDBLog::Host), maybe_thread.takeError(), "failed to launch host thread: {0}"); diff --git a/lldb/source/Core/ValueObjectVTable.cpp b/lldb/source/Core/ValueObjectVTable.cpp index 66e0750b63f8f..e38f0a83df994 100644 --- a/lldb/source/Core/ValueObjectVTable.cpp +++ b/lldb/source/Core/ValueObjectVTable.cpp @@ -220,7 +220,7 @@ bool ValueObjectVTable::UpdateValue() { llvm::Expected vtable_info_or_err = language_runtime->GetVTableInfo(*parent, /*check_type=*/true); if (!vtable_info_or_err) { - m_error = vtable_info_or_err.takeError(); + m_error = Status::FromError(vtable_info_or_err.takeError()); return false; } diff --git a/lldb/source/Core/ValueObjectVariable.cpp b/lldb/source/Core/ValueObjectVariable.cpp index 01f871a6f8bc4..29aefb270c92c 100644 --- a/lldb/source/Core/ValueObjectVariable.cpp +++ b/lldb/source/Core/ValueObjectVariable.cpp @@ -249,7 +249,7 @@ bool ValueObjectVariable::UpdateValue() { SetValueIsValid(m_error.Success()); } else { - m_error = maybe_value.takeError(); + m_error = Status::FromError(maybe_value.takeError()); // could not find location, won't allow editing m_resolved_value.SetContext(Value::ContextType::Invalid, nullptr); } diff --git a/lldb/source/DataFormatters/VectorType.cpp b/lldb/source/DataFormatters/VectorType.cpp index 19de204c24353..f6c38f76fea31 100644 --- a/lldb/source/DataFormatters/VectorType.cpp +++ b/lldb/source/DataFormatters/VectorType.cpp @@ -233,7 +233,7 @@ class VectorTypeSyntheticFrontEnd : public SyntheticChildrenFrontEnd { auto num_children_or_err = CalculateNumChildren(); if (!num_children_or_err) return ValueObjectConstResult::Create( - nullptr, Status(num_children_or_err.takeError())); + nullptr, Status::FromError(num_children_or_err.takeError())); if (idx >= 
*num_children_or_err) return {}; std::optional size = m_child_type.GetByteSize(nullptr); diff --git a/lldb/source/Host/common/FileCache.cpp b/lldb/source/Host/common/FileCache.cpp index 4ac198171f96c..87c90a03fd98d 100644 --- a/lldb/source/Host/common/FileCache.cpp +++ b/lldb/source/Host/common/FileCache.cpp @@ -32,7 +32,7 @@ lldb::user_id_t FileCache::OpenFile(const FileSpec &file_spec, } auto file = FileSystem::Instance().Open(file_spec, flags, mode); if (!file) { - error = file.takeError(); + error = Status::FromError(file.takeError()); return UINT64_MAX; } lldb::user_id_t fd = file.get()->GetDescriptor(); diff --git a/lldb/source/Host/common/NativeProcessProtocol.cpp b/lldb/source/Host/common/NativeProcessProtocol.cpp index d3b9dde368db0..a84d8db1c8794 100644 --- a/lldb/source/Host/common/NativeProcessProtocol.cpp +++ b/lldb/source/Host/common/NativeProcessProtocol.cpp @@ -350,7 +350,7 @@ Status NativeProcessProtocol::SetSoftwareBreakpoint(lldb::addr_t addr, } auto expected_bkpt = EnableSoftwareBreakpoint(addr, size_hint); if (!expected_bkpt) - return Status(expected_bkpt.takeError()); + return Status::FromError(expected_bkpt.takeError()); m_software_breakpoints.emplace(addr, std::move(*expected_bkpt)); return Status(); diff --git a/lldb/source/Host/common/TCPSocket.cpp b/lldb/source/Host/common/TCPSocket.cpp index 4f1518ef697ff..b28ba148ee1af 100644 --- a/lldb/source/Host/common/TCPSocket.cpp +++ b/lldb/source/Host/common/TCPSocket.cpp @@ -137,7 +137,7 @@ Status TCPSocket::Connect(llvm::StringRef name) { Status error; llvm::Expected host_port = DecodeHostAndPort(name); if (!host_port) - return Status(host_port.takeError()); + return Status::FromError(host_port.takeError()); std::vector addresses = SocketAddress::GetAddressInfo(host_port->hostname.c_str(), nullptr, @@ -176,7 +176,7 @@ Status TCPSocket::Listen(llvm::StringRef name, int backlog) { Status error; llvm::Expected host_port = DecodeHostAndPort(name); if (!host_port) - return Status(host_port.takeError()); + return Status::FromError(host_port.takeError()); if (host_port->hostname == "*") host_port->hostname = "0.0.0.0"; @@ -287,7 +287,7 @@ Status TCPSocket::Accept(Socket *&conn_socket) { accept_loop.RequestTermination(); }); if (!expected_handles) - return Status(expected_handles.takeError()); + return Status::FromError(expected_handles.takeError()); return accept_loop.Run(); } diff --git a/lldb/source/Host/macosx/objcxx/Host.mm b/lldb/source/Host/macosx/objcxx/Host.mm index 94a2b916574c6..fe63cc16c6499 100644 --- a/lldb/source/Host/macosx/objcxx/Host.mm +++ b/lldb/source/Host/macosx/objcxx/Host.mm @@ -316,7 +316,7 @@ repeat with the_window in (get windows)\n\ unix_socket_name, [&] { return AcceptPIDFromInferior(connect_url); }); if (!accept_thread) - return Status(accept_thread.takeError()); + return Status::FromError(accept_thread.takeError()); [applescript executeAndReturnError:nil]; diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp index 6a40f66be39b1..2a2fcf00c0adf 100644 --- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp +++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp @@ -652,7 +652,7 @@ ConnectionFileDescriptor::ConnectUDP(llvm::StringRef s, Socket::UdpConnect(s, m_child_processes_inherit); if (!socket) { if (error_ptr) - *error_ptr = socket.takeError(); + *error_ptr = Status::FromError(socket.takeError()); else LLDB_LOG_ERROR(GetLog(LLDBLog::Connection), socket.takeError(), "tcp connect failed: {0}"); @@ -769,7 
+769,7 @@ ConnectionStatus ConnectionFileDescriptor::ConnectSerialPort( SerialPort::OptionsFromURL(qs); if (!serial_options) { if (error_ptr) - *error_ptr = serial_options.takeError(); + *error_ptr = Status::FromError(serial_options.takeError()); else llvm::consumeError(serial_options.takeError()); return eConnectionStatusError; @@ -786,7 +786,7 @@ ConnectionStatus ConnectionFileDescriptor::ConnectSerialPort( fd, File::eOpenOptionReadWrite, serial_options.get(), true); if (!serial_sp) { if (error_ptr) - *error_ptr = serial_sp.takeError(); + *error_ptr = Status::FromError(serial_sp.takeError()); else llvm::consumeError(serial_sp.takeError()); return eConnectionStatusError; diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp index c819024ccf018..cf2682cd26faa 100644 --- a/lldb/source/Interpreter/CommandObject.cpp +++ b/lldb/source/Interpreter/CommandObject.cpp @@ -121,7 +121,7 @@ bool CommandObject::ParseOptions(Args &args, CommandReturnObject &result) { args = std::move(*args_or); error = options->NotifyOptionParsingFinished(&exe_ctx); } else - error = args_or.takeError(); + error = Status::FromError(args_or.takeError()); if (error.Success()) { if (options->VerifyOptions(result)) diff --git a/lldb/source/Interpreter/OptionValueRegex.cpp b/lldb/source/Interpreter/OptionValueRegex.cpp index d810df503f589..91ec41df6ee50 100644 --- a/lldb/source/Interpreter/OptionValueRegex.cpp +++ b/lldb/source/Interpreter/OptionValueRegex.cpp @@ -51,7 +51,7 @@ Status OptionValueRegex::SetValueFromString(llvm::StringRef value, m_value_was_set = true; NotifyValueChanged(); } else if (llvm::Error err = m_regex.GetError()) { - return Status(std::move(err)); + return Status::FromError(std::move(err)); } else { return Status::FromErrorString("regex error"); } diff --git a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp index 2c9b3c425397a..f3c137d99703b 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp @@ -114,7 +114,7 @@ class BlockPointerSyntheticFrontEnd : public SyntheticChildrenFrontEnd { if (!child_type_or_err) return ValueObjectConstResult::Create( exe_ctx.GetBestExecutionContextScope(), - Status(child_type_or_err.takeError())); + Status::FromError(child_type_or_err.takeError())); CompilerType child_type = *child_type_or_err; ValueObjectSP struct_pointer_sp = diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 2004622e547be..b28beab117cca 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -6822,7 +6822,7 @@ bool ObjectFileMachO::SaveCore(const lldb::ProcessSP &process_sp, outfile, File::eOpenOptionWriteOnly | File::eOpenOptionTruncate | File::eOpenOptionCanCreate); if (!core_file) { - error = core_file.takeError(); + error = Status::FromError(core_file.takeError()); } else { // Read 1 page at a time uint8_t bytes[0x1000]; diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp index 0897895e6bc25..5da69dd4f2ce7 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp @@ -70,7 +70,7 @@ bool ObjectFileMinidump::SaveCore(const lldb::ProcessSP &process_sp, 
options.GetOutputFile().value(), File::eOpenOptionWriteOnly | File::eOpenOptionCanCreate); if (!maybe_core_file) { - error = maybe_core_file.takeError(); + error = Status::FromError(maybe_core_file.takeError()); return false; } MinidumpFileBuilder builder(std::move(maybe_core_file.get()), process_sp, diff --git a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm index 2825db6e3a6b5..303a5409c6fe4 100644 --- a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm +++ b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm @@ -408,7 +408,7 @@ static Status HandleFileAction(ProcessLaunchInfo &launch_info, launch_info.GetPTY().GetSecondaryFileDescriptor(); if (secondary_fd == PseudoTerminal::invalid_fd) { if (llvm::Error Err = launch_info.GetPTY().OpenSecondary(O_RDWR)) - return Status(std::move(Err)); + return Status::FromError(std::move(Err)); } secondary_fd = launch_info.GetPTY().GetSecondaryFileDescriptor(); assert(secondary_fd != PseudoTerminal::invalid_fd); diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index 1e2e3a80b18bf..cc0e34eecdf30 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -1287,7 +1287,7 @@ Status NativeProcessLinux::PopulateMemoryRegionCache() { return true; } - Result = Info.takeError(); + Result = Status::FromError(Info.takeError()); m_supports_mem_region = LazyBool::eLazyBoolNo; LLDB_LOG(log, "failed to parse proc maps: {0}", Result); return false; @@ -1475,7 +1475,7 @@ Status NativeProcessLinux::ReadMemoryTags(int32_t type, lldb::addr_t addr, llvm::Expected details = GetCurrentThread()->GetRegisterContext().GetMemoryTaggingDetails(type); if (!details) - return Status(details.takeError()); + return Status::FromError(details.takeError()); // Ignore 0 length read if (!len) @@ -1530,7 +1530,7 @@ Status NativeProcessLinux::WriteMemoryTags(int32_t type, lldb::addr_t addr, llvm::Expected details = GetCurrentThread()->GetRegisterContext().GetMemoryTaggingDetails(type); if (!details) - return Status(details.takeError()); + return Status::FromError(details.takeError()); // Ignore 0 length write if (!len) @@ -1547,18 +1547,18 @@ Status NativeProcessLinux::WriteMemoryTags(int32_t type, lldb::addr_t addr, llvm::Expected> unpacked_tags_or_err = details->manager->UnpackTagsData(tags); if (!unpacked_tags_or_err) - return Status(unpacked_tags_or_err.takeError()); + return Status::FromError(unpacked_tags_or_err.takeError()); llvm::Expected> repeated_tags_or_err = details->manager->RepeatTagsForRange(*unpacked_tags_or_err, range); if (!repeated_tags_or_err) - return Status(repeated_tags_or_err.takeError()); + return Status::FromError(repeated_tags_or_err.takeError()); // Repack them for ptrace to use llvm::Expected> final_tag_data = details->manager->PackTags(*repeated_tags_or_err); if (!final_tag_data) - return Status(final_tag_data.takeError()); + return Status::FromError(final_tag_data.takeError()); struct iovec tags_vec; uint8_t *src = final_tag_data->data(); @@ -1790,7 +1790,7 @@ void NativeProcessLinux::NotifyTracersProcessWillResume() { Status NativeProcessLinux::NotifyTracersOfNewThread(lldb::tid_t tid) { Log *log = GetLog(POSIXLog::Thread); - Status error(m_intel_pt_collector.OnThreadCreated(tid)); + Status error = 
Status::FromError(m_intel_pt_collector.OnThreadCreated(tid)); if (error.Fail()) LLDB_LOG(log, "Failed to trace a new thread with intel-pt, tid = {0}. {1}", tid, error.AsCString()); @@ -1799,7 +1799,7 @@ Status NativeProcessLinux::NotifyTracersOfNewThread(lldb::tid_t tid) { Status NativeProcessLinux::NotifyTracersOfThreadDestroyed(lldb::tid_t tid) { Log *log = GetLog(POSIXLog::Thread); - Status error(m_intel_pt_collector.OnThreadDestroyed(tid)); + Status error = Status::FromError(m_intel_pt_collector.OnThreadDestroyed(tid)); if (error.Fail()) LLDB_LOG(log, "Failed to stop a destroyed thread with intel-pt, tid = {0}. {1}", diff --git a/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg_arm64.cpp b/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg_arm64.cpp index 4bec3de586685..f1d0756b3ed9c 100644 --- a/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg_arm64.cpp +++ b/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg_arm64.cpp @@ -172,7 +172,7 @@ Status NativeRegisterContextDBReg_arm64::ClearAllHardwareBreakpoints() { // Read hardware breakpoint and watchpoint information. llvm::Error error = ReadHardwareDebugInfo(); if (error) - return Status(std::move(error)); + return Status::FromError(std::move(error)); for (uint32_t i = 0; i < m_max_hbp_supported; i++) { if (BreakpointIsEnabled(i)) { @@ -191,7 +191,7 @@ Status NativeRegisterContextDBReg_arm64::ClearAllHardwareBreakpoints() { m_hbp_regs[i].control = tempControl; m_hbp_regs[i].address = tempAddr; - return Status(std::move(error)); + return Status::FromError(std::move(error)); } } } @@ -356,7 +356,7 @@ Status NativeRegisterContextDBReg_arm64::ClearAllHardwareWatchpoints() { // Read hardware breakpoint and watchpoint information. llvm::Error error = ReadHardwareDebugInfo(); if (error) - return Status(std::move(error)); + return Status::FromError(std::move(error)); for (uint32_t i = 0; i < m_max_hwp_supported; i++) { if (WatchpointIsEnabled(i)) { @@ -375,7 +375,7 @@ Status NativeRegisterContextDBReg_arm64::ClearAllHardwareWatchpoints() { m_hwp_regs[i].control = tempControl; m_hwp_regs[i].address = tempAddr; - return Status(std::move(error)); + return Status::FromError(std::move(error)); } } } @@ -420,7 +420,7 @@ Status NativeRegisterContextDBReg_arm64::GetWatchpointHitIndex( // Read hardware breakpoint and watchpoint information. llvm::Error error = ReadHardwareDebugInfo(); if (error) - return Status(std::move(error)); + return Status::FromError(std::move(error)); // Mask off ignored bits from watchpoint trap address. 
trap_addr = FixWatchpointHitAddress(trap_addr); diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp index 0e8407fc46edf..7955594bf5d94 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp @@ -194,7 +194,7 @@ Status ProcessElfCore::DoLoadCore() { // Parse thread contexts and auxv structure if (H.p_type == llvm::ELF::PT_NOTE) { if (llvm::Error error = ParseThreadContextsFromNoteSegment(H, data)) - return Status(std::move(error)); + return Status::FromError(std::move(error)); } // PT_LOAD segments contains address map if (H.p_type == llvm::ELF::PT_LOAD) { diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp index 50fa11e916c5f..1f1e511346879 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp @@ -847,7 +847,7 @@ Status GDBRemoteCommunication::StartListenThread(const char *hostname, llvm::Expected listen_thread = ThreadLauncher::LaunchThread( listen_url, [this] { return GDBRemoteCommunication::ListenThread(); }); if (!listen_thread) - return Status(listen_thread.takeError()); + return Status::FromError(listen_thread.takeError()); m_listen_thread = *listen_thread; return Status(); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index 55d76ca8532d3..f6b1db7b8bbbc 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -1745,7 +1745,7 @@ Status GDBRemoteCommunicationClient::LoadQXferMemoryMap() { llvm::Expected xml = ReadExtFeature("memory-map", ""); if (!xml) - return Status(xml.takeError()); + return Status::FromError(xml.takeError()); XMLDocument xml_document; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp index 9d08a5d3411f1..9b72cb0035282 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp @@ -123,7 +123,7 @@ GDBRemoteCommunicationServer::SendErrorResponse(llvm::Error error) { [&](std::unique_ptr E) { EIB = std::move(E); }); if (EIB) - return SendErrorResponse(Status(llvm::Error(std::move(EIB)))); + return SendErrorResponse(Status::FromError(llvm::Error(std::move(EIB)))); return SendUnimplementedResponse(""); } diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 504c994f980cf..35fa93e53bc66 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -277,7 +277,7 @@ Status GDBRemoteCommunicationServerLLGS::LaunchProcess() { // lldb-server on Windows. 
#if !defined(_WIN32) if (llvm::Error Err = m_process_launch_info.SetUpPtyRedirection()) - return Status(std::move(Err)); + return Status::FromError(std::move(Err)); #endif } @@ -287,7 +287,7 @@ Status GDBRemoteCommunicationServerLLGS::LaunchProcess() { "process but one already exists"); auto process_or = m_process_manager.Launch(m_process_launch_info, *this); if (!process_or) - return Status(process_or.takeError()); + return Status::FromError(process_or.takeError()); m_continue_process = m_current_process = process_or->get(); m_debugged_processes.emplace( m_current_process->GetID(), @@ -356,7 +356,7 @@ Status GDBRemoteCommunicationServerLLGS::AttachToProcess(lldb::pid_t pid) { // Try to attach. auto process_or = m_process_manager.Attach(pid, *this); if (!process_or) { - Status status(process_or.takeError()); + Status status = Status::FromError(process_or.takeError()); llvm::errs() << llvm::formatv("failed to attach to process {0}: {1}\n", pid, status); return status; @@ -1367,7 +1367,7 @@ GDBRemoteCommunicationServerLLGS::Handle_jLLDBTraceGetBinaryData( llvm::json::parse(packet.Peek(), "TraceGetBinaryDataRequest"); if (!request) - return SendErrorResponse(Status(request.takeError())); + return SendErrorResponse(Status::FromError(request.takeError())); if (Expected> bytes = m_current_process->TraceGetBinaryData(*request)) { diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp index 30e782e3be184..2f2750ec2b920 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp @@ -167,7 +167,7 @@ Status GDBRemoteCommunicationServerPlatform::LaunchGDBServer( if (available_port) port = *available_port; else - return Status(available_port.takeError()); + return Status::FromError(available_port.takeError()); } // Spawn a new thread to accept the port that gets bound after binding to diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index 7a326a557547d..ac1ecbfc0e2e7 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -186,7 +186,7 @@ void ProcessMinidump::Terminate() { Status ProcessMinidump::DoLoadCore() { auto expected_parser = MinidumpParser::Create(m_core_data); if (!expected_parser) - return Status(expected_parser.takeError()); + return Status::FromError(expected_parser.takeError()); m_minidump_parser = std::move(*expected_parser); Status error; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h index cbb6cd41aa867..c1dcdc7df6cee 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h @@ -269,7 +269,7 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { transformed_args); if (llvm::Error e = expected_return_object.takeError()) { - error = Status(std::move(e)); + error = Status::FromError(std::move(e)); return ErrorWithMessage(caller_signature, "Python method could not be called.", error); } diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp 
b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp index ce14b531ea29c..24cf343000632 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp @@ -1105,7 +1105,7 @@ template class OwnedPythonFile : public Base { if (!m_borrowed) { auto r = m_py_obj.CallMethod("close"); if (!r) - py_error = Status(r.takeError()); + py_error = Status::FromError(r.takeError()); } base_error = Base::Close(); if (py_error.Fail()) @@ -1196,7 +1196,7 @@ class PythonIOFile : public OwnedPythonFile { return Flush(); auto r = m_py_obj.CallMethod("close"); if (!r) - return Status(r.takeError()); + return Status::FromError(r.takeError()); return Status(); } @@ -1204,7 +1204,7 @@ class PythonIOFile : public OwnedPythonFile { GIL takeGIL; auto r = m_py_obj.CallMethod("flush"); if (!r) - return Status(r.takeError()); + return Status::FromError(r.takeError()); return Status(); } @@ -1240,12 +1240,12 @@ class BinaryPythonFile : public PythonIOFile { PyObject *pybuffer_p = PyMemoryView_FromMemory( const_cast((const char *)buf), num_bytes, PyBUF_READ); if (!pybuffer_p) - return Status(llvm::make_error()); + return Status::FromError(llvm::make_error()); auto pybuffer = Take(pybuffer_p); num_bytes = 0; auto bytes_written = As(m_py_obj.CallMethod("write", pybuffer)); if (!bytes_written) - return Status(bytes_written.takeError()); + return Status::FromError(bytes_written.takeError()); if (bytes_written.get() < 0) return Status::FromErrorString( ".write() method returned a negative number!"); @@ -1260,7 +1260,7 @@ class BinaryPythonFile : public PythonIOFile { auto pybuffer_obj = m_py_obj.CallMethod("read", (unsigned long long)num_bytes); if (!pybuffer_obj) - return Status(pybuffer_obj.takeError()); + return Status::FromError(pybuffer_obj.takeError()); num_bytes = 0; if (pybuffer_obj.get().IsNone()) { // EOF @@ -1269,7 +1269,7 @@ class BinaryPythonFile : public PythonIOFile { } auto pybuffer = PythonBuffer::Create(pybuffer_obj.get()); if (!pybuffer) - return Status(pybuffer.takeError()); + return Status::FromError(pybuffer.takeError()); memcpy(buf, pybuffer.get().get().buf, pybuffer.get().get().len); num_bytes = pybuffer.get().get().len; return Status(); @@ -1295,12 +1295,12 @@ class TextPythonFile : public PythonIOFile { auto pystring = PythonString::FromUTF8(llvm::StringRef((const char *)buf, num_bytes)); if (!pystring) - return Status(pystring.takeError()); + return Status::FromError(pystring.takeError()); num_bytes = 0; auto bytes_written = As(m_py_obj.CallMethod("write", pystring.get())); if (!bytes_written) - return Status(bytes_written.takeError()); + return Status::FromError(bytes_written.takeError()); if (bytes_written.get() < 0) return Status::FromErrorString( ".write() method returned a negative number!"); @@ -1321,14 +1321,14 @@ class TextPythonFile : public PythonIOFile { auto pystring = As( m_py_obj.CallMethod("read", (unsigned long long)num_chars)); if (!pystring) - return Status(pystring.takeError()); + return Status::FromError(pystring.takeError()); if (pystring.get().IsNone()) { // EOF return Status(); } auto stringref = pystring.get().AsUTF8(); if (!stringref) - return Status(stringref.takeError()); + return Status::FromError(stringref.takeError()); num_bytes = stringref.get().size(); memcpy(buf, stringref.get().begin(), num_bytes); return Status(); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp 
b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 76f2640a3ea69..63691d24f0dad 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -1110,7 +1110,7 @@ Status ScriptInterpreterPythonImpl::ExecuteMultipleLines( options.GetEnableIO(), m_debugger, /*result=*/nullptr); if (!io_redirect_or_error) - return Status(io_redirect_or_error.takeError()); + return Status::FromError(io_redirect_or_error.takeError()); ScriptInterpreterIORedirect &io_redirect = **io_redirect_or_error; @@ -1144,7 +1144,7 @@ Status ScriptInterpreterPythonImpl::ExecuteMultipleLines( E.Restore(); return error; }); - return Status(std::move(error)); + return Status::FromError(std::move(error)); } return Status(); @@ -2393,7 +2393,7 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule( exc_options.GetEnableIO(), m_debugger, /*result=*/nullptr); if (!io_redirect_or_error) { - error = io_redirect_or_error.takeError(); + error = Status::FromError(io_redirect_or_error.takeError()); return false; } @@ -2435,7 +2435,7 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule( if (extra_search_dir) { if (llvm::Error e = ExtendSysPath(extra_search_dir.GetPath())) { - error = std::move(e); + error = Status::FromError(std::move(e)); return false; } } else { @@ -2465,7 +2465,7 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule( } if (llvm::Error e = ExtendSysPath(module_file.GetDirectory().GetCString())) { - error = std::move(e); + error = Status::FromError(std::move(e)); return false; } module_name = module_file.GetFilename().GetCString(); diff --git a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp index 1137f24451d28..4ca8bd2f9085d 100644 --- a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp +++ b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp @@ -289,7 +289,7 @@ class RegexFilterRule : public FilterRule { // Instantiate the regex so we can report any errors. 
auto regex = RegularExpression(op_arg); if (llvm::Error err = regex.GetError()) { - error = Status(std::move(err)); + error = Status::FromError(std::move(err)); return FilterRuleSP(); } diff --git a/lldb/source/Target/ModuleCache.cpp b/lldb/source/Target/ModuleCache.cpp index ce009f9b2fafe..ccae7ea106c97 100644 --- a/lldb/source/Target/ModuleCache.cpp +++ b/lldb/source/Target/ModuleCache.cpp @@ -166,7 +166,7 @@ ModuleLock::ModuleLock(const FileSpec &root_dir_spec, const UUID &uuid, m_file_up = std::move(file.get()); else { m_file_up.reset(); - error = Status(file.takeError()); + error = Status::FromError(file.takeError()); return; } diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp index b65a27dedc081..7792edcc2cb58 100644 --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -1131,7 +1131,7 @@ Status Platform::PutFile(const FileSpec &source, const FileSpec &destination, auto source_file = FileSystem::Instance().Open(source, source_open_options, lldb::eFilePermissionsUserRW); if (!source_file) - return Status(source_file.takeError()); + return Status::FromError(source_file.takeError()); Status error; bool requires_upload = true; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 97ce2c14458e9..3d7ddbe294a49 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -6486,13 +6486,13 @@ Status Process::WriteMemoryTags(lldb::addr_t addr, size_t len, llvm::Expected tag_manager_or_err = GetMemoryTagManager(); if (!tag_manager_or_err) - return Status(tag_manager_or_err.takeError()); + return Status::FromError(tag_manager_or_err.takeError()); const MemoryTagManager *tag_manager = *tag_manager_or_err; llvm::Expected> packed_tags = tag_manager->PackTags(tags); if (!packed_tags) { - return Status(packed_tags.takeError()); + return Status::FromError(packed_tags.takeError()); } return DoWriteMemoryTags(addr, len, tag_manager->GetAllocationTagType(), diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index e35a4c318d358..1610971a34148 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1104,7 +1104,7 @@ bool StackFrame::GetFrameBaseValue(Scalar &frame_base, Status *error_ptr) { m_sc.function->GetFrameBaseExpression().Evaluate( &exe_ctx, nullptr, loclist_base_addr, nullptr, nullptr); if (!expr_value) - m_frame_base_error = expr_value.takeError(); + m_frame_base_error = Status::FromError(expr_value.takeError()); else m_frame_base = expr_value->ResolveValue(&exe_ctx); } else { diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 899e822851d81..902fbb2b519ef 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -2078,7 +2078,8 @@ lldb::ValueObjectSP Thread::GetSiginfoValue() { llvm::Expected> data = GetSiginfo(*type_size); if (!data) - return ValueObjectConstResult::Create(&target, Status(data.takeError())); + return ValueObjectConstResult::Create(&target, + Status::FromError(data.takeError())); DataExtractor data_extractor{data.get()->getBufferStart(), data.get()->getBufferSize(), process_sp->GetByteOrder(), arch.GetAddressByteSize()}; diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp index 098128b1a50e5..329f5b6e4b9a5 100644 --- a/lldb/source/Utility/Scalar.cpp +++ b/lldb/source/Utility/Scalar.cpp @@ -684,7 +684,7 @@ Status Scalar::SetValueFromCString(const char *value_str, Encoding encoding, m_type = e_float; m_float = std::move(f); } else - 
error = op.takeError(); + error = Status::FromError(op.takeError()); break; } diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index 7260b7b3e0a03..131fc662bfc0a 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -55,10 +55,10 @@ Status::Status(std::string err_str) : m_code(LLDB_GENERIC_ERROR), m_type(eErrorTypeGeneric), m_string(std::move(err_str)) {} -const Status &Status::operator=(llvm::Error error) { +Status::Status(llvm::Error error) { if (!error) { Clear(); - return *this; + return; } // if the error happens to be a errno error, preserve the error code @@ -79,8 +79,6 @@ const Status &Status::operator=(llvm::Error error) { m_type = eErrorTypeGeneric; m_string = llvm::toString(std::move(error)); } - - return *this; } Status Status::FromErrorStringWithFormat(const char *format, ...) { @@ -96,6 +94,8 @@ Status Status::FromErrorStringWithFormat(const char *format, ...) { return Status(string); } +Status Status::FromError(llvm::Error error) { return Status(std::move(error)); } + llvm::Error Status::ToError() const { if (Success()) return llvm::Error::success(); diff --git a/lldb/source/Utility/StructuredData.cpp b/lldb/source/Utility/StructuredData.cpp index 5f9821b979c07..fb4f6920d62eb 100644 --- a/lldb/source/Utility/StructuredData.cpp +++ b/lldb/source/Utility/StructuredData.cpp @@ -46,7 +46,7 @@ StructuredData::ParseJSONFromFile(const FileSpec &input_spec, Status &error) { json::parse(buffer_or_error.get()->getBuffer().str()); if (value) return ParseJSONValue(*value); - error = Status(value.takeError()); + error = Status::FromError(value.takeError()); return StructuredData::ObjectSP(); } diff --git a/lldb/unittests/TestingSupport/Host/NativeProcessTestUtils.h b/lldb/unittests/TestingSupport/Host/NativeProcessTestUtils.h index a610b37a6b38e..1a017122411a8 100644 --- a/lldb/unittests/TestingSupport/Host/NativeProcessTestUtils.h +++ b/lldb/unittests/TestingSupport/Host/NativeProcessTestUtils.h @@ -75,7 +75,7 @@ template class MockProcess : public T { auto ExpectedMemory = this->ReadMemory(Addr, Size); if (!ExpectedMemory) { BytesRead = 0; - return Status(ExpectedMemory.takeError()); + return Status::FromError(ExpectedMemory.takeError()); } BytesRead = ExpectedMemory->size(); assert(BytesRead <= Size); @@ -89,7 +89,7 @@ template class MockProcess : public T { Addr, llvm::ArrayRef(static_cast(Buf), Size)); if (!ExpectedBytes) { BytesWritten = 0; - return Status(ExpectedBytes.takeError()); + return Status::FromError(ExpectedBytes.takeError()); } BytesWritten = *ExpectedBytes; return Status(); diff --git a/lldb/unittests/Utility/StatusTest.cpp b/lldb/unittests/Utility/StatusTest.cpp index d33909ea89727..be4f2beebcdb5 100644 --- a/lldb/unittests/Utility/StatusTest.cpp +++ b/lldb/unittests/Utility/StatusTest.cpp @@ -27,21 +27,20 @@ TEST(StatusTest, Formatv) { } TEST(StatusTest, ErrorConstructor) { - EXPECT_TRUE(Status(llvm::Error::success()).Success()); + EXPECT_TRUE(Status::FromError(llvm::Error::success()).Success()); - Status eagain( + Status eagain = Status::FromError( llvm::errorCodeToError(std::error_code(EAGAIN, std::generic_category()))); EXPECT_TRUE(eagain.Fail()); EXPECT_EQ(eErrorTypePOSIX, eagain.GetType()); EXPECT_EQ(Status::ValueType(EAGAIN), eagain.GetError()); - Status foo(llvm::make_error( - "foo", llvm::inconvertibleErrorCode())); + Status foo = Status::FromError(llvm::createStringError("foo")); EXPECT_TRUE(foo.Fail()); EXPECT_EQ(eErrorTypeGeneric, foo.GetType()); EXPECT_STREQ("foo", foo.AsCString()); - foo = 
llvm::Error::success();
+  foo = Status::FromError(llvm::Error::success());
   EXPECT_TRUE(foo.Success());
 }
 
@@ -52,6 +51,11 @@ TEST(StatusTest, ErrorCodeConstructor) {
   EXPECT_TRUE(eagain.Fail());
   EXPECT_EQ(eErrorTypePOSIX, eagain.GetType());
   EXPECT_EQ(Status::ValueType(EAGAIN), eagain.GetError());
+
+  llvm::Error list = llvm::joinErrors(llvm::createStringError("foo"),
+                                      llvm::createStringError("bar"));
+  Status foobar = Status::FromError(std::move(list));
+  EXPECT_EQ(std::string("foo\nbar"), std::string(foobar.AsCString()));
 }
 
 TEST(StatusTest, ErrorConversion) {

From 9df592fb806b77d5fb0c7a9d5c9057d1626587e3 Mon Sep 17 00:00:00 2001
From: David Green
Date: Thu, 5 Sep 2024 20:25:56 +0100
Subject: [PATCH 278/425] [AArch64] Fold away zext of extract of uzp. (#107367)

Similar to #107201, this comes up from the lowering of zext of
deinterleaving shuffles. Patterns such as ext(extract_subvector(uzp(a, b)))
can be converted to a simple AND that performs the extract/zext from a
uzp1. Uzp2 can be handled with an extra shift, and due to the existing
legalization there may already be an and / shift in between, which can be
combined in. Mostly this reduces instruction count or increases the amount
of parallelism in the sequence.
---
 .../Target/AArch64/AArch64ISelLowering.cpp |  66 +++++++
 llvm/test/CodeGen/AArch64/zext-shuffle.ll  | 162 ++++++++----------
 2 files changed, 135 insertions(+), 93 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c0671dd1f0087..c8e8a1e612e0a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22261,6 +22261,70 @@ static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
           DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
 }
 
+// This comes up similarly to the above when lowering deinterleaving shuffles
+// from zexts. We have legalized the operations in the general case to
+// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
+// the extract is to the low half and the uzp is uzp1. There would be an extra
+// shift if the uzp was uzp2 to grab the upper half. Due to the combine above
+// there could also be an existing and / shift that can be combined in, either
+// before or after the extract.
+static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (N->getOpcode() != ISD::ZERO_EXTEND || + (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)) + return SDValue(); + + SDValue Op = N->getOperand(0); + unsigned ExtOffset = (unsigned)-1; + if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + ExtOffset = Op.getConstantOperandVal(1); + Op = Op.getOperand(0); + } + + unsigned Shift = 0; + APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(), + Op.getValueType().getScalarSizeInBits()); + + if (Op.getOpcode() == AArch64ISD::VLSHR) { + Shift = Op.getConstantOperandVal(1); + Op = Op.getOperand(0); + Mask = Mask.lshr(Shift); + } + if (Op.getOpcode() == ISD::AND && + ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) { + Op = Op.getOperand(0); + Mask = Mask.zext(VT.getScalarSizeInBits()); + } else if (Op.getOpcode() == AArch64ISD::BICi) { + Mask = ~APInt(Op.getValueType().getScalarSizeInBits(), + Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2)); + Mask = Mask.zext(VT.getScalarSizeInBits()); + Op = Op.getOperand(0); + } + + if (ExtOffset == (unsigned)-1) { + if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + ExtOffset = Op.getConstantOperandVal(1); + Op = Op.getOperand(0); + } else + return SDValue(); + } + if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements()) + return SDValue(); + + if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2) + return SDValue(); + if (Op.getOpcode() == AArch64ISD::UZP2) + Shift += VT.getScalarSizeInBits() / 2; + + SDLoc DL(N); + SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT, + Op.getOperand(ExtOffset == 0 ? 0 : 1)); + if (Shift != 0) + BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC, + DAG.getConstant(Shift, DL, MVT::i32)); + return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT)); +} + static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -22283,6 +22347,8 @@ static SDValue performExtendCombine(SDNode *N, if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG)) return R; + if (SDValue R = performZExtUZPCombine(N, DAG)) + return R; if (N->getValueType(0).isFixedLengthVector() && N->getOpcode() == ISD::SIGN_EXTEND && diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll index 6415fba29ff79..6d25c874a2893 100644 --- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll @@ -90,10 +90,11 @@ define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) { define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ushr v0.4s, v0.4s, #16 -; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: movi v2.2d, #0x0000000000ffff +; CHECK-NEXT: ushr v0.2d, v0.2d, #16 +; CHECK-NEXT: ushr v1.2d, v1.2d, #16 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> %z1 = zext <4 x i16> %s1 to <4 x i64> @@ -117,10 +118,8 @@ define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) { define <4 x i64> @v2i64_i16_371115(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ushr v0.4s, v0.4s, #16 -; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushr v0.2d, v0.2d, #48 +; CHECK-NEXT: ushr v1.2d, 
v1.2d, #48 ; CHECK-NEXT: ret %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> %z1 = zext <4 x i16> %s1 to <4 x i64> @@ -142,8 +141,7 @@ define <4 x i32> @v4i32_0246(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_1357: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v0.8h -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -210,8 +208,7 @@ define <8 x i16> @v8i16_0246(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_1357(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_1357: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -278,8 +275,7 @@ define <8 x i32> @v8i32_0246(<16 x i8> %a, <16 x i8> %b) { define <8 x i32> @v8i32_1357(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i32_1357: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ret @@ -291,10 +287,9 @@ define <8 x i32> @v8i32_1357(<16 x i8> %a, <16 x i8> %b) { define <8 x i32> @v8i32_04812(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i32_04812: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i32> @@ -304,10 +299,11 @@ define <8 x i32> @v8i32_04812(<16 x i8> %a, <16 x i8> %b) { define <8 x i32> @v8i32_15913(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i32_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: ushr v0.8h, v0.8h, #8 -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-NEXT: ushr v0.4s, v0.4s, #8 +; CHECK-NEXT: ushr v1.4s, v1.4s, #8 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i32> @@ -317,10 +313,10 @@ define <8 x i32> @v8i32_15913(<16 x i8> %a, <16 x i8> %b) { define <8 x i32> @v8i32_261014(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i32_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 +; CHECK-NEXT: ushr v1.4s, v1.4s, #16 +; CHECK-NEXT: bic v0.4s, #255, lsl #8 +; CHECK-NEXT: bic v1.4s, #255, lsl #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i32> @@ -330,10 +326,8 @@ define <8 x i32> @v8i32_261014(<16 x i8> %a, <16 x i8> %b) { define <8 x i32> @v8i32_371115(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i32_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: ushr v0.8h, v0.8h, #8 -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; 
CHECK-NEXT: ushr v0.4s, v0.4s, #24 +; CHECK-NEXT: ushr v1.4s, v1.4s, #24 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i32> @@ -407,77 +401,59 @@ define <8 x i64> @zext_load_add(ptr %p) { define <8 x double> @uitofp_fadd(<32 x i16> %l) { ; CHECK-LABEL: uitofp_fadd: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v5.4s, v0.4s, v3.4s -; CHECK-NEXT: uzp1 v6.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi d4, #0x00ffff0000ffff -; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8 -; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #8 -; CHECK-NEXT: uzp2 v1.4s, v0.4s, v3.4s -; CHECK-NEXT: and v17.8b, v6.8b, v4.8b -; CHECK-NEXT: and v18.8b, v7.8b, v4.8b -; CHECK-NEXT: ushr v6.2s, v6.2s, #16 -; CHECK-NEXT: ushr v7.2s, v7.2s, #16 -; CHECK-NEXT: and v21.8b, v0.8b, v4.8b -; CHECK-NEXT: and v22.8b, v2.8b, v4.8b -; CHECK-NEXT: ushr v2.2s, v2.2s, #16 -; CHECK-NEXT: and v19.8b, v16.8b, v4.8b -; CHECK-NEXT: and v20.8b, v5.8b, v4.8b -; CHECK-NEXT: ushll v3.2d, v17.2s, #0 -; CHECK-NEXT: ushll v17.2d, v18.2s, #0 -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ushr v16.2s, v16.2s, #16 -; CHECK-NEXT: ushr v5.2s, v5.2s, #16 -; CHECK-NEXT: ushll v6.2d, v6.2s, #0 -; CHECK-NEXT: ushll v7.2d, v7.2s, #0 -; CHECK-NEXT: ushll v18.2d, v19.2s, #0 -; CHECK-NEXT: ushll v19.2d, v20.2s, #0 -; CHECK-NEXT: ext v20.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ushr v0.2s, v0.2s, #16 -; CHECK-NEXT: ushll v16.2d, v16.2s, #0 -; CHECK-NEXT: ushll v21.2d, v21.2s, #0 -; CHECK-NEXT: ushll v5.2d, v5.2s, #0 -; CHECK-NEXT: ushll v22.2d, v22.2s, #0 -; CHECK-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: ucvtf v17.2d, v17.2d -; CHECK-NEXT: ucvtf v6.2d, v6.2d -; CHECK-NEXT: and v23.8b, v20.8b, v4.8b -; CHECK-NEXT: and v4.8b, v1.8b, v4.8b -; CHECK-NEXT: ushr v20.2s, v20.2s, #16 -; CHECK-NEXT: ushr v1.2s, v1.2s, #16 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: movi v4.2d, #0x0000000000ffff +; CHECK-NEXT: ushr v5.2d, v0.2d, #16 +; CHECK-NEXT: ushr v6.2d, v1.2d, #16 +; CHECK-NEXT: ushr v7.2d, v2.2d, #16 +; CHECK-NEXT: ushr v17.2d, v3.2d, #16 +; CHECK-NEXT: ushr v20.2d, v0.2d, #32 +; CHECK-NEXT: ushr v22.2d, v1.2d, #32 +; CHECK-NEXT: ushr v23.2d, v2.2d, #32 +; CHECK-NEXT: ushr v24.2d, v3.2d, #32 +; CHECK-NEXT: and v16.16b, v0.16b, v4.16b +; CHECK-NEXT: and v18.16b, v1.16b, v4.16b +; CHECK-NEXT: and v19.16b, v2.16b, v4.16b +; CHECK-NEXT: and v21.16b, v3.16b, v4.16b +; CHECK-NEXT: and v5.16b, v5.16b, v4.16b +; CHECK-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-NEXT: and v7.16b, v7.16b, v4.16b +; CHECK-NEXT: and v17.16b, v17.16b, v4.16b +; CHECK-NEXT: and v20.16b, v20.16b, v4.16b +; CHECK-NEXT: and v22.16b, v22.16b, v4.16b +; CHECK-NEXT: and v23.16b, v23.16b, v4.16b +; CHECK-NEXT: and v4.16b, v24.16b, v4.16b +; CHECK-NEXT: ushr v0.2d, v0.2d, #48 +; CHECK-NEXT: ushr v1.2d, v1.2d, #48 +; CHECK-NEXT: ushr v2.2d, v2.2d, #48 +; CHECK-NEXT: ushr v3.2d, v3.2d, #48 +; CHECK-NEXT: ucvtf v16.2d, v16.2d ; CHECK-NEXT: ucvtf v18.2d, v18.2d ; CHECK-NEXT: ucvtf v19.2d, v19.2d -; CHECK-NEXT: ucvtf v16.2d, v16.2d -; CHECK-NEXT: ushll v23.2d, v23.2s, #0 -; CHECK-NEXT: ushll v4.2d, v4.2s, #0 -; CHECK-NEXT: ushll v20.2d, v20.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: ucvtf v5.2d, v5.2d ; CHECK-NEXT: ucvtf v21.2d, v21.2d +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ucvtf v6.2d, v6.2d +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: 
ucvtf v17.2d, v17.2d +; CHECK-NEXT: ucvtf v20.2d, v20.2d ; CHECK-NEXT: ucvtf v22.2d, v22.2d -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v2.2d, v2.2d ; CHECK-NEXT: ucvtf v23.2d, v23.2d ; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v20.2d, v20.2d +; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fadd v16.2d, v18.2d, v16.2d -; CHECK-NEXT: fadd v7.2d, v17.2d, v7.2d -; CHECK-NEXT: fadd v5.2d, v19.2d, v5.2d -; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d -; CHECK-NEXT: fadd v0.2d, v21.2d, v0.2d -; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d -; CHECK-NEXT: fadd v4.2d, v4.2d, v1.2d -; CHECK-NEXT: fadd v1.2d, v23.2d, v20.2d -; CHECK-NEXT: fadd v0.2d, v3.2d, v0.2d +; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ucvtf v3.2d, v3.2d +; CHECK-NEXT: fadd v5.2d, v16.2d, v5.2d +; CHECK-NEXT: fadd v17.2d, v21.2d, v17.2d +; CHECK-NEXT: fadd v7.2d, v19.2d, v7.2d +; CHECK-NEXT: fadd v6.2d, v18.2d, v6.2d +; CHECK-NEXT: fadd v0.2d, v20.2d, v0.2d +; CHECK-NEXT: fadd v1.2d, v22.2d, v1.2d +; CHECK-NEXT: fadd v3.2d, v4.2d, v3.2d +; CHECK-NEXT: fadd v2.2d, v23.2d, v2.2d +; CHECK-NEXT: fadd v0.2d, v5.2d, v0.2d +; CHECK-NEXT: fadd v1.2d, v6.2d, v1.2d ; CHECK-NEXT: fadd v2.2d, v7.2d, v2.2d -; CHECK-NEXT: fadd v1.2d, v16.2d, v1.2d -; CHECK-NEXT: fadd v3.2d, v5.2d, v4.2d +; CHECK-NEXT: fadd v3.2d, v17.2d, v3.2d ; CHECK-NEXT: ret %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> %z1 = uitofp <8 x i16> %s1 to <8 x double> From 362da640dd18e2ef960e0d2198fe8378104c4119 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 5 Sep 2024 12:32:07 -0700 Subject: [PATCH 279/425] [SandboxIR][Bench] Test RAUW (#107440) --- llvm/benchmarks/SandboxIRBench.cpp | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp index 633de6db1f5e2..1a7f808af21f2 100644 --- a/llvm/benchmarks/SandboxIRBench.cpp +++ b/llvm/benchmarks/SandboxIRBench.cpp @@ -106,10 +106,40 @@ template static void GetType(benchmark::State &State) { benchmark::DoNotOptimize(I->getType()); } +static std::string generateRAUWIR(unsigned Size) { + std::stringstream SS; + SS << "define void @foo(i32 %v1, i32 %v2) {\n"; + SS << " %def1 = add i32 %v1, %v2\n"; + SS << " %def2 = add i32 %v1, %v2\n"; + for (auto Cnt : seq(0, Size)) + SS << " %add" << Cnt << " = add i32 %def1, %def1\n"; + SS << "ret void"; + SS << "}"; + return SS.str(); +} + +template static void RAUW(benchmark::State &State) { + LLVMContext LLVMCtx; + sandboxir::Context Ctx(LLVMCtx); + std::unique_ptr LLVMM; + unsigned NumInstrs = State.range(0); + auto *BB = genIR(LLVMM, LLVMCtx, Ctx, generateRAUWIR, NumInstrs); + auto It = BB->begin(); + auto *Def1 = &*It++; + auto *Def2 = &*It++; + for (auto _ : State) { + Def1->replaceAllUsesWith(Def2); + Def2->replaceAllUsesWith(Def1); + } +} + BENCHMARK(GetType); BENCHMARK(GetType); BENCHMARK(BBWalk)->Args({1024}); BENCHMARK(BBWalk)->Args({1024}); +BENCHMARK(RAUW)->Args({512}); +BENCHMARK(RAUW)->Args({512}); + BENCHMARK_MAIN(); From f32e5bdcefcff80f4296f8f4abedc37dcda36d53 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Thu, 5 Sep 2024 12:34:47 -0700 Subject: [PATCH 280/425] [NFC] Rename the `Nr` abbreviation to `Num` (#107151) It's more clear. (This isn't exhaustive). 
--- .../lib/ctx_profile/CtxInstrContextNode.h | 25 +++++----- .../lib/ctx_profile/CtxInstrProfiling.cpp | 41 +++++++-------- .../lib/ctx_profile/CtxInstrProfiling.h | 4 +- llvm/include/llvm/Analysis/MLModelRunner.h | 4 +- .../llvm/ProfileData/CtxInstrContextNode.h | 25 +++++----- .../Analysis/FunctionPropertiesAnalysis.cpp | 4 +- llvm/lib/Analysis/MLInlineAdvisor.cpp | 6 +-- llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp | 36 ++++++------- .../Instrumentation/PGOCtxProfLowering.cpp | 50 +++++++++---------- .../Instrumentation/PGOInstrumentation.cpp | 6 +-- .../PGOCtxProfReaderWriterTest.cpp | 8 +-- 11 files changed, 106 insertions(+), 103 deletions(-) diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h index a916f197aa148..5991458c5732d 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h +++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h @@ -68,18 +68,19 @@ using GUID = uint64_t; class ContextNode final { const GUID Guid; ContextNode *const Next; - const uint32_t NrCounters; - const uint32_t NrCallsites; + const uint32_t NumCounters; + const uint32_t NumCallsites; public: - ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites, + ContextNode(GUID Guid, uint32_t NumCounters, uint32_t NumCallsites, ContextNode *Next = nullptr) - : Guid(Guid), Next(Next), NrCounters(NrCounters), - NrCallsites(NrCallsites) {} + : Guid(Guid), Next(Next), NumCounters(NumCounters), + NumCallsites(NumCallsites) {} - static inline size_t getAllocSize(uint32_t NrCounters, uint32_t NrCallsites) { - return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters + - sizeof(ContextNode *) * NrCallsites; + static inline size_t getAllocSize(uint32_t NumCounters, + uint32_t NumCallsites) { + return sizeof(ContextNode) + sizeof(uint64_t) * NumCounters + + sizeof(ContextNode *) * NumCallsites; } // The counters vector starts right after the static header. @@ -88,8 +89,8 @@ class ContextNode final { return reinterpret_cast(addr_after); } - uint32_t counters_size() const { return NrCounters; } - uint32_t callsites_size() const { return NrCallsites; } + uint32_t counters_size() const { return NumCounters; } + uint32_t callsites_size() const { return NumCallsites; } const uint64_t *counters() const { return const_cast(this)->counters(); @@ -97,7 +98,7 @@ class ContextNode final { // The subcontexts vector starts right after the end of the counters vector. 
ContextNode **subContexts() { - return reinterpret_cast(&(counters()[NrCounters])); + return reinterpret_cast(&(counters()[NumCounters])); } ContextNode *const *subContexts() const { @@ -107,7 +108,7 @@ class ContextNode final { GUID guid() const { return Guid; } ContextNode *next() const { return Next; } - size_t size() const { return getAllocSize(NrCounters, NrCallsites); } + size_t size() const { return getAllocSize(NumCounters, NumCallsites); } uint64_t entrycount() const { return counters()[0]; } }; diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp index a0a535015bf2e..df30986cdfc69 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp +++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp @@ -92,10 +92,11 @@ bool validate(const ContextRoot *Root) { } inline ContextNode *allocContextNode(char *Place, GUID Guid, - uint32_t NrCounters, uint32_t NrCallsites, + uint32_t NumCounters, + uint32_t NumCallsites, ContextNode *Next = nullptr) { assert(reinterpret_cast(Place) % ExpectedAlignment == 0); - return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next); + return new (Place) ContextNode(Guid, NumCounters, NumCallsites, Next); } void resetContextNode(ContextNode &Node) { @@ -161,8 +162,8 @@ void Arena::freeArenaList(Arena *&A) { // If this is the first time we hit a callsite with this (Guid) particular // callee, we need to allocate. ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint, - uint32_t NrCounters, uint32_t NrCallsites) { - auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites); + uint32_t NumCounters, uint32_t NumCallsites) { + auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites); auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem; char *AllocPlace = Mem->tryBumpAllocate(AllocSize); if (!AllocPlace) { @@ -175,15 +176,15 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint, Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem); AllocPlace = Mem->tryBumpAllocate(AllocSize); } - auto *Ret = allocContextNode(AllocPlace, Guid, NrCounters, NrCallsites, + auto *Ret = allocContextNode(AllocPlace, Guid, NumCounters, NumCallsites, *InsertionPoint); *InsertionPoint = Ret; return Ret; } ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid, - uint32_t NrCounters, - uint32_t NrCallsites) { + uint32_t NumCounters, + uint32_t NumCallsites) { // fast "out" if we're not even doing contextual collection. if (!__llvm_ctx_profile_current_context_root) return TheScratchContext; @@ -222,14 +223,14 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid, Callsite = Callsite->next(); } auto *Ret = Callsite ? 
Callsite - : getCallsiteSlow(Guid, CallsiteContext, NrCounters, - NrCallsites); - if (Ret->callsites_size() != NrCallsites || - Ret->counters_size() != NrCounters) + : getCallsiteSlow(Guid, CallsiteContext, NumCounters, + NumCallsites); + if (Ret->callsites_size() != NumCallsites || + Ret->counters_size() != NumCounters) __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: " "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n", - reinterpret_cast(Ret), Guid, NrCallsites, - NrCounters, Ret->guid(), Ret->callsites_size(), + reinterpret_cast(Ret), Guid, NumCallsites, + NumCounters, Ret->guid(), Ret->callsites_size(), Ret->counters_size()); onContextEnter(*Ret); return Ret; @@ -237,19 +238,19 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid, // This should be called once for a Root. Allocate the first arena, set up the // first context. -void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters, - uint32_t NrCallsites) { +void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters, + uint32_t NumCallsites) { __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock( &AllContextsMutex); // Re-check - we got here without having had taken a lock. if (Root->FirstMemBlock) return; - const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites); + const auto Needed = ContextNode::getAllocSize(NumCounters, NumCallsites); auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed)); Root->FirstMemBlock = M; Root->CurrentMem = M; Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid, - NrCounters, NrCallsites); + NumCounters, NumCallsites); AllContextRoots.PushBack(Root); } @@ -278,7 +279,7 @@ void __llvm_ctx_profile_release_context(ContextRoot *Root) } void __llvm_ctx_profile_start_collection() { - size_t NrMemUnits = 0; + size_t NumMemUnits = 0; __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock( &AllContextsMutex); for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) { @@ -286,11 +287,11 @@ void __llvm_ctx_profile_start_collection() { __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock( &Root->Taken); for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) - ++NrMemUnits; + ++NumMemUnits; resetContextNode(*Root->FirstNode); } - __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits); + __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits); } bool __llvm_ctx_profile_fetch(void *Data, diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h index f55068e98dd43..74d346d6e0a07 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h +++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h @@ -153,8 +153,8 @@ void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root); /// called for any other function than entry points, in the entry BB of such /// function. Same consideration about LSB of returned value as .._start_context ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid, - uint32_t NrCounters, - uint32_t NrCallsites); + uint32_t NumCounters, + uint32_t NumCallsites); /// Prepares for collection. Currently this resets counter values but preserves /// internal context tree structure. 
diff --git a/llvm/include/llvm/Analysis/MLModelRunner.h b/llvm/include/llvm/Analysis/MLModelRunner.h index 21f155de85aec..9f7f19a369d9c 100644 --- a/llvm/include/llvm/Analysis/MLModelRunner.h +++ b/llvm/include/llvm/Analysis/MLModelRunner.h @@ -54,8 +54,8 @@ class MLModelRunner { virtual void switchContext(StringRef Name) {} protected: - MLModelRunner(LLVMContext &Ctx, Kind Type, size_t NrInputs) - : Ctx(Ctx), Type(Type), InputBuffers(NrInputs) { + MLModelRunner(LLVMContext &Ctx, Kind Type, size_t NumInputs) + : Ctx(Ctx), Type(Type), InputBuffers(NumInputs) { assert(Type != Kind::Unknown); } virtual void *evaluateUntyped() = 0; diff --git a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h index a916f197aa148..5991458c5732d 100644 --- a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h +++ b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h @@ -68,18 +68,19 @@ using GUID = uint64_t; class ContextNode final { const GUID Guid; ContextNode *const Next; - const uint32_t NrCounters; - const uint32_t NrCallsites; + const uint32_t NumCounters; + const uint32_t NumCallsites; public: - ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites, + ContextNode(GUID Guid, uint32_t NumCounters, uint32_t NumCallsites, ContextNode *Next = nullptr) - : Guid(Guid), Next(Next), NrCounters(NrCounters), - NrCallsites(NrCallsites) {} + : Guid(Guid), Next(Next), NumCounters(NumCounters), + NumCallsites(NumCallsites) {} - static inline size_t getAllocSize(uint32_t NrCounters, uint32_t NrCallsites) { - return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters + - sizeof(ContextNode *) * NrCallsites; + static inline size_t getAllocSize(uint32_t NumCounters, + uint32_t NumCallsites) { + return sizeof(ContextNode) + sizeof(uint64_t) * NumCounters + + sizeof(ContextNode *) * NumCallsites; } // The counters vector starts right after the static header. @@ -88,8 +89,8 @@ class ContextNode final { return reinterpret_cast(addr_after); } - uint32_t counters_size() const { return NrCounters; } - uint32_t callsites_size() const { return NrCallsites; } + uint32_t counters_size() const { return NumCounters; } + uint32_t callsites_size() const { return NumCallsites; } const uint64_t *counters() const { return const_cast(this)->counters(); @@ -97,7 +98,7 @@ class ContextNode final { // The subcontexts vector starts right after the end of the counters vector. 
ContextNode **subContexts() { - return reinterpret_cast(&(counters()[NrCounters])); + return reinterpret_cast(&(counters()[NumCounters])); } ContextNode *const *subContexts() const { @@ -107,7 +108,7 @@ class ContextNode final { GUID guid() const { return Guid; } ContextNode *next() const { return Next; } - size_t size() const { return getAllocSize(NrCounters, NrCallsites); } + size_t size() const { return getAllocSize(NumCounters, NumCallsites); } uint64_t entrycount() const { return counters()[0]; } }; diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index af6574052c8c8..0ffbc90d7ee22 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -47,7 +47,7 @@ static cl::opt CallWithManyArgumentsThreshold( "it is considered having many arguments.")); namespace { -int64_t getNrBlocksFromCond(const BasicBlock &BB) { +int64_t getNumBlocksFromCond(const BasicBlock &BB) { int64_t Ret = 0; if (const auto *BI = dyn_cast(BB.getTerminator())) { if (BI->isConditional()) @@ -72,7 +72,7 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock &BB, assert(Direction == 1 || Direction == -1); BasicBlockCount += Direction; BlocksReachedFromConditionalInstruction += - (Direction * getNrBlocksFromCond(BB)); + (Direction * getNumBlocksFromCond(BB)); for (const auto &I : BB) { if (auto *CS = dyn_cast(&I)) { const auto *Callee = CS->getCalledFunction(); diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index 8bb5efcf1b2ec..2db58d1c2578b 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -400,9 +400,9 @@ std::unique_ptr MLInlineAdvisor::getAdviceImpl(CallBase &CB) { if (Mandatory) return getMandatoryAdvice(CB, true); - auto NrCtantParams = 0; + auto NumCtantParams = 0; for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) { - NrCtantParams += (isa(*I)); + NumCtantParams += (isa(*I)); } auto &CallerBefore = getCachedFPI(Caller); @@ -414,7 +414,7 @@ std::unique_ptr MLInlineAdvisor::getAdviceImpl(CallBase &CB) { getInitialFunctionLevel(Caller); *ModelRunner->getTensor(FeatureIndex::node_count) = NodeCount; *ModelRunner->getTensor(FeatureIndex::nr_ctant_params) = - NrCtantParams; + NumCtantParams; *ModelRunner->getTensor(FeatureIndex::edge_count) = EdgeCount; *ModelRunner->getTensor(FeatureIndex::caller_users) = CallerBefore.Uses; diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 0a124ac3fd1b6..d099544c2a491 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -273,7 +273,7 @@ struct LIFeatureComponents { double RW = 0; double IndVarUpdates = 0; double HintWeights = 0.0; - int64_t NrDefsAndUses = 0; + int64_t NumDefsAndUses = 0; float HottestBlockFreq = 0.0; bool IsRemat = false; }; @@ -327,7 +327,7 @@ class MLEvictAdvisor : public RegAllocEvictionAdvisor { void extractFeatures(const SmallVectorImpl &Intervals, llvm::SmallVectorImpl &Largest, size_t Pos, - int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent, + int64_t IsHint, int64_t LocalIntfsCount, float NumUrgent, SmallVectorImpl &LRPosInfo) const; // Point-in-time: we didn't learn this, so we always delegate to the @@ -609,7 +609,7 @@ bool MLEvictAdvisor::loadInterferenceFeatures( const bool IsLocal = LIS->intervalIsInOneMBB(VirtReg); int64_t LocalIntfs = 0; - float NrUrgent = 0.0f; + float NumUrgent = 0.0f; // The cascade 
tracking is the same as in the default advisor unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg()); @@ -649,7 +649,7 @@ bool MLEvictAdvisor::loadInterferenceFeatures( if (Cascade <= IntfCascade) { if (!Urgent) return false; - ++NrUrgent; + ++NumUrgent; } LocalIntfs += (IsLocal && LIS->intervalIsInOneMBB(*Intf) && @@ -659,7 +659,7 @@ bool MLEvictAdvisor::loadInterferenceFeatures( // OK, so if we made it this far, this LR is an eviction candidate, load its // features. extractFeatures(InterferingIntervals, Largest, Pos, IsHint, LocalIntfs, - NrUrgent, LRPosInfo); + NumUrgent, LRPosInfo); return true; } @@ -731,7 +731,7 @@ MCRegister MLEvictAdvisor::tryFindEvictionCandidate( extractFeatures(SmallVector(1, &VirtReg), Largest, CandidateVirtRegPos, /*IsHint*/ 0, /*LocalIntfsCount*/ 0, - /*NrUrgent*/ 0.0, LRPosInfo); + /*NumUrgent*/ 0.0, LRPosInfo); assert(InitialQSize > 0.0 && "We couldn't have gotten here if we had " "nothing to allocate initially."); #ifdef LLVM_HAVE_TFLITE @@ -809,7 +809,7 @@ MLEvictAdvisor::getLIFeatureComponents(const LiveInterval &LI) const { I != E;) { MachineInstr *MI = &*(I++); - ++Ret.NrDefsAndUses; + ++Ret.NumDefsAndUses; if (!Visited.insert(MI).second) continue; @@ -846,10 +846,10 @@ MLEvictAdvisor::getLIFeatureComponents(const LiveInterval &LI) const { void MLEvictAdvisor::extractFeatures( const SmallVectorImpl &Intervals, llvm::SmallVectorImpl &Largest, size_t Pos, int64_t IsHint, - int64_t LocalIntfsCount, float NrUrgent, + int64_t LocalIntfsCount, float NumUrgent, SmallVectorImpl &LRPosInfo) const { - int64_t NrDefsAndUses = 0; - int64_t NrBrokenHints = 0; + int64_t NumDefsAndUses = 0; + int64_t NumBrokenHints = 0; double R = 0.0; double W = 0.0; double RW = 0.0; @@ -858,7 +858,7 @@ void MLEvictAdvisor::extractFeatures( float StartBBFreq = 0.0; float EndBBFreq = 0.0; float HottestBlockFreq = 0.0; - int32_t NrRematerializable = 0; + int32_t NumRematerializable = 0; float TotalWeight = 0.0; SlotIndex EndSI = LIS->getSlotIndexes()->getZeroIndex(); @@ -882,9 +882,9 @@ void MLEvictAdvisor::extractFeatures( if (LI.endIndex() > EndSI) EndSI = LI.endIndex(); const LIFeatureComponents &LIFC = getLIFeatureComponents(LI); - NrBrokenHints += VRM->hasPreferredPhys(LI.reg()); + NumBrokenHints += VRM->hasPreferredPhys(LI.reg()); - NrDefsAndUses += LIFC.NrDefsAndUses; + NumDefsAndUses += LIFC.NumDefsAndUses; HottestBlockFreq = std::max(HottestBlockFreq, LIFC.HottestBlockFreq); R += LIFC.R; W += LIFC.W; @@ -893,7 +893,7 @@ void MLEvictAdvisor::extractFeatures( IndVarUpdates += LIFC.IndVarUpdates; HintWeights += LIFC.HintWeights; - NrRematerializable += LIFC.IsRemat; + NumRematerializable += LIFC.IsRemat; if (EnableDevelopmentFeatures) { for (auto CurrentSegment : LI) { @@ -922,12 +922,12 @@ void MLEvictAdvisor::extractFeatures( } while (false) SET(mask, int64_t, 1); SET(is_free, int64_t, Intervals.empty()); - SET(nr_urgent, float, NrUrgent); - SET(nr_broken_hints, float, NrBrokenHints); + SET(nr_urgent, float, NumUrgent); + SET(nr_broken_hints, float, NumBrokenHints); SET(is_hint, int64_t, IsHint); SET(is_local, int64_t, LocalIntfsCount); - SET(nr_rematerializable, float, NrRematerializable); - SET(nr_defs_and_uses, float, NrDefsAndUses); + SET(nr_rematerializable, float, NumRematerializable); + SET(nr_defs_and_uses, float, NumDefsAndUses); SET(weighed_reads_by_max, float, R); SET(weighed_writes_by_max, float, W); SET(weighed_read_writes_by_max, float, RW); diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp 
b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index 43bebc99316e0..b620306628729 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -71,33 +71,33 @@ class CtxInstrumentationLowerer final { // of its parameters, and llvm.instrprof.callsite captures the total number of // callsites. Those values are the same for instances of those intrinsics in // this function. Find the first instance of each and return them. -std::pair getNrCountersAndCallsites(const Function &F) { - uint32_t NrCounters = 0; - uint32_t NrCallsites = 0; +std::pair getNumCountersAndCallsites(const Function &F) { + uint32_t NumCounters = 0; + uint32_t NumCallsites = 0; for (const auto &BB : F) { for (const auto &I : BB) { if (const auto *Incr = dyn_cast(&I)) { uint32_t V = static_cast(Incr->getNumCounters()->getZExtValue()); - assert((!NrCounters || V == NrCounters) && + assert((!NumCounters || V == NumCounters) && "expected all llvm.instrprof.increment[.step] intrinsics to " "have the same total nr of counters parameter"); - NrCounters = V; + NumCounters = V; } else if (const auto *CSIntr = dyn_cast(&I)) { uint32_t V = static_cast(CSIntr->getNumCounters()->getZExtValue()); - assert((!NrCallsites || V == NrCallsites) && + assert((!NumCallsites || V == NumCallsites) && "expected all llvm.instrprof.callsite intrinsics to have the " "same total nr of callsites parameter"); - NrCallsites = V; + NumCallsites = V; } #if NDEBUG - if (NrCounters && NrCallsites) - return std::make_pair(NrCounters, NrCallsites); + if (NumCounters && NumCallsites) + return std::make_pair(NumCounters, NumCallsites); #endif } } - return {NrCounters, NrCallsites}; + return {NumCounters, NumCallsites}; } } // namespace @@ -124,8 +124,8 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, ContextNodeTy = StructType::get(M.getContext(), { I64Ty, /*Guid*/ PointerTy, /*Next*/ - I32Ty, /*NrCounters*/ - I32Ty, /*NrCallsites*/ + I32Ty, /*NumCounters*/ + I32Ty, /*NumCallsites*/ }); // Define a global for each entrypoint. We'll reuse the entrypoint's name as @@ -157,7 +157,7 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, FunctionType::get(ContextNodeTy->getPointerTo(), {ContextRootTy->getPointerTo(), /*ContextRoot*/ I64Ty, /*Guid*/ I32Ty, - /*NrCounters*/ I32Ty /*NrCallsites*/}, + /*NumCounters*/ I32Ty /*NumCallsites*/}, false)) .getCallee()); GetCtx = cast( @@ -165,8 +165,8 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, FunctionType::get(ContextNodeTy->getPointerTo(), {PointerTy, /*Callee*/ I64Ty, /*Guid*/ - I32Ty, /*NrCounters*/ - I32Ty}, /*NrCallsites*/ + I32Ty, /*NumCounters*/ + I32Ty}, /*NumCallsites*/ false)) .getCallee()); ReleaseCtx = cast( @@ -208,7 +208,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { auto &ORE = FAM.getResult(F); Value *Guid = nullptr; - auto [NrCounters, NrCallsites] = getNrCountersAndCallsites(F); + auto [NumCounters, NumCallsites] = getNumCountersAndCallsites(F); Value *Context = nullptr; Value *RealContext = nullptr; @@ -229,12 +229,12 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { Guid = Builder.getInt64( AssignGUIDPass::getGUID(cast(*Mark->getNameValue()))); // The type of the context of this function is now knowable since we have - // NrCallsites and NrCounters. We delcare it here because it's more + // NumCallsites and NumCounters. We delcare it here because it's more // convenient - we have the Builder. 
ThisContextType = StructType::get( F.getContext(), - {ContextNodeTy, ArrayType::get(Builder.getInt64Ty(), NrCounters), - ArrayType::get(Builder.getPtrTy(), NrCallsites)}); + {ContextNodeTy, ArrayType::get(Builder.getInt64Ty(), NumCounters), + ArrayType::get(Builder.getPtrTy(), NumCallsites)}); // Figure out which way we obtain the context object for this function - // if it's an entrypoint, then we call StartCtx, otherwise GetCtx. In the // former case, we also set TheRootContext since we need to release it @@ -243,22 +243,22 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { auto Iter = ContextRootMap.find(&F); if (Iter != ContextRootMap.end()) { TheRootContext = Iter->second; - Context = Builder.CreateCall(StartCtx, {TheRootContext, Guid, - Builder.getInt32(NrCounters), - Builder.getInt32(NrCallsites)}); + Context = Builder.CreateCall( + StartCtx, {TheRootContext, Guid, Builder.getInt32(NumCounters), + Builder.getInt32(NumCallsites)}); ORE.emit( [&] { return OptimizationRemark(DEBUG_TYPE, "Entrypoint", &F); }); } else { Context = - Builder.CreateCall(GetCtx, {&F, Guid, Builder.getInt32(NrCounters), - Builder.getInt32(NrCallsites)}); + Builder.CreateCall(GetCtx, {&F, Guid, Builder.getInt32(NumCounters), + Builder.getInt32(NumCallsites)}); ORE.emit([&] { return OptimizationRemark(DEBUG_TYPE, "RegularFunction", &F); }); } // The context could be scratch. auto *CtxAsInt = Builder.CreatePtrToInt(Context, Builder.getInt64Ty()); - if (NrCallsites > 0) { + if (NumCallsites > 0) { // Figure out which index of the TLS 2-element buffers to use. // Scratch context => we use index == 1. Real contexts => index == 0. auto *Index = Builder.CreateAnd(CtxAsInt, Builder.getInt64(1)); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 8dd0cfdb2ae0a..9dd4a561edfdd 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -953,15 +953,15 @@ void FunctionInstrumenter::instrument() { } }; // First, count callsites. - uint32_t TotalNrCallsites = 0; - Visit([&TotalNrCallsites](auto *) { ++TotalNrCallsites; }); + uint32_t TotalNumCallsites = 0; + Visit([&TotalNumCallsites](auto *) { ++TotalNumCallsites; }); // Now instrument. 
uint32_t CallsiteIndex = 0; Visit([&](auto *CB) { IRBuilder<> Builder(CB); Builder.CreateCall(CSIntrinsic, - {Name, CFGHash, Builder.getInt32(TotalNrCallsites), + {Name, CFGHash, Builder.getInt32(TotalNumCallsites), Builder.getInt32(CallsiteIndex++), CB->getCalledOperand()}); }); diff --git a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp index 7be01445558ec..f48f4f1ac9cc1 100644 --- a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp +++ b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp @@ -24,12 +24,12 @@ class PGOCtxProfRWTest : public ::testing::Test { std::map Roots; public: - ContextNode *createNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites, - ContextNode *Next = nullptr) { - auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites); + ContextNode *createNode(GUID Guid, uint32_t NumCounters, + uint32_t NumCallsites, ContextNode *Next = nullptr) { + auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites); auto *Mem = Nodes.emplace_back(std::make_unique(AllocSize)).get(); std::memset(Mem, 0, AllocSize); - auto *Ret = new (Mem) ContextNode(Guid, NrCounters, NrCallsites, Next); + auto *Ret = new (Mem) ContextNode(Guid, NumCounters, NumCallsites, Next); return Ret; } From 9de972e3e1ecea506a3884bd2fc47570ce83e4df Mon Sep 17 00:00:00 2001 From: Ming-Yi Lai Date: Fri, 6 Sep 2024 03:35:12 +0800 Subject: [PATCH 281/425] [clang] Fix FnInfoOpts::operator&= and FnInfoOpts::operator|= not updating assigned operands (#107050) This affects CodeGenTypes::arrangeCall. No test because the only current in-tree use of that function isn't affected. --- clang/lib/CodeGen/CGCall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.h b/clang/lib/CodeGen/CGCall.h index 6fa65e1916183..92e0cc43919ca 100644 --- a/clang/lib/CodeGen/CGCall.h +++ b/clang/lib/CodeGen/CGCall.h @@ -450,12 +450,12 @@ inline FnInfoOpts operator&(FnInfoOpts A, FnInfoOpts B) { llvm::to_underlying(B)); } -inline FnInfoOpts operator|=(FnInfoOpts A, FnInfoOpts B) { +inline FnInfoOpts &operator|=(FnInfoOpts &A, FnInfoOpts B) { A = A | B; return A; } -inline FnInfoOpts operator&=(FnInfoOpts A, FnInfoOpts B) { +inline FnInfoOpts &operator&=(FnInfoOpts &A, FnInfoOpts B) { A = A & B; return A; } From 5515b086f35c2c1402234b9b94338f10fc9d16f7 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 12:35:26 -0700 Subject: [PATCH 282/425] Factor Process::ExecutionResultAsCString() into a global function (NFC) --- lldb/include/lldb/Target/Process.h | 2 - lldb/include/lldb/Utility/Status.h | 2 + lldb/source/Expression/FunctionCaller.cpp | 3 +- lldb/source/Expression/LLVMUserExpression.cpp | 6 +-- lldb/source/Target/Process.cpp | 37 ------------------- lldb/source/Utility/Status.cpp | 26 +++++++++++++ 6 files changed, 32 insertions(+), 44 deletions(-) diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index a7de991104434..c66cfb2c245ef 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -1296,8 +1296,6 @@ class Process : public std::enable_shared_from_this, const EvaluateExpressionOptions &options, DiagnosticManager &diagnostic_manager); - static const char *ExecutionResultAsCString(lldb::ExpressionResults result); - void GetStatus(Stream &ostrm); size_t GetThreadStatus(Stream &ostrm, bool only_threads_with_stop_reason, diff --git a/lldb/include/lldb/Utility/Status.h b/lldb/include/lldb/Utility/Status.h 
index 3813a3c160470..308532e7c633d 100644 --- a/lldb/include/lldb/Utility/Status.h +++ b/lldb/include/lldb/Utility/Status.h @@ -26,6 +26,8 @@ class raw_ostream; namespace lldb_private { +const char *ExpressionResultAsCString(lldb::ExpressionResults result); + /// \class Status Status.h "lldb/Utility/Status.h" An error handling class. /// /// This class is designed to be able to hold any error code that can be diff --git a/lldb/source/Expression/FunctionCaller.cpp b/lldb/source/Expression/FunctionCaller.cpp index 5ac2b0681ebbe..5ce0175fedf45 100644 --- a/lldb/source/Expression/FunctionCaller.cpp +++ b/lldb/source/Expression/FunctionCaller.cpp @@ -390,8 +390,7 @@ lldb::ExpressionResults FunctionCaller::ExecuteFunction( LLDB_LOGF(log, "== [FunctionCaller::ExecuteFunction] Execution of \"%s\" " "completed abnormally: %s ==", - m_name.c_str(), - Process::ExecutionResultAsCString(return_value)); + m_name.c_str(), ExpressionResultAsCString(return_value)); } else { LLDB_LOGF(log, "== [FunctionCaller::ExecuteFunction] Execution of \"%s\" " diff --git a/lldb/source/Expression/LLVMUserExpression.cpp b/lldb/source/Expression/LLVMUserExpression.cpp index b4fdfc4d1fa8b..9e07d4a7374a7 100644 --- a/lldb/source/Expression/LLVMUserExpression.cpp +++ b/lldb/source/Expression/LLVMUserExpression.cpp @@ -235,9 +235,9 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager, expr_thread_id); return execution_result; } else if (execution_result != lldb::eExpressionCompleted) { - diagnostic_manager.Printf( - lldb::eSeverityError, "Couldn't execute function; result was %s", - Process::ExecutionResultAsCString(execution_result)); + diagnostic_manager.Printf(lldb::eSeverityError, + "Couldn't execute function; result was %s", + ExpressionResultAsCString(execution_result)); return execution_result; } } diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 3d7ddbe294a49..f2a631a466b35 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -5743,43 +5743,6 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx, return return_value; } -const char *Process::ExecutionResultAsCString(ExpressionResults result) { - const char *result_name = ""; - - switch (result) { - case eExpressionCompleted: - result_name = "eExpressionCompleted"; - break; - case eExpressionDiscarded: - result_name = "eExpressionDiscarded"; - break; - case eExpressionInterrupted: - result_name = "eExpressionInterrupted"; - break; - case eExpressionHitBreakpoint: - result_name = "eExpressionHitBreakpoint"; - break; - case eExpressionSetupError: - result_name = "eExpressionSetupError"; - break; - case eExpressionParseError: - result_name = "eExpressionParseError"; - break; - case eExpressionResultUnavailable: - result_name = "eExpressionResultUnavailable"; - break; - case eExpressionTimedOut: - result_name = "eExpressionTimedOut"; - break; - case eExpressionStoppedForDebug: - result_name = "eExpressionStoppedForDebug"; - break; - case eExpressionThreadVanished: - result_name = "eExpressionThreadVanished"; - } - return result_name; -} - void Process::GetStatus(Stream &strm) { const StateType state = GetState(); if (StateIsStoppedState(state, false)) { diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index 131fc662bfc0a..40e1fbf3fab1b 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -204,3 +204,29 @@ void llvm::format_provider::format( llvm::format_provider::format(error.AsCString(), OS, Options); } + +const char 
*lldb_private::ExpressionResultAsCString(ExpressionResults result) {
+  switch (result) {
+  case eExpressionCompleted:
+    return "eExpressionCompleted";
+  case eExpressionDiscarded:
+    return "eExpressionDiscarded";
+  case eExpressionInterrupted:
+    return "eExpressionInterrupted";
+  case eExpressionHitBreakpoint:
+    return "eExpressionHitBreakpoint";
+  case eExpressionSetupError:
+    return "eExpressionSetupError";
+  case eExpressionParseError:
+    return "eExpressionParseError";
+  case eExpressionResultUnavailable:
+    return "eExpressionResultUnavailable";
+  case eExpressionTimedOut:
+    return "eExpressionTimedOut";
+  case eExpressionStoppedForDebug:
+    return "eExpressionStoppedForDebug";
+  case eExpressionThreadVanished:
+    return "eExpressionThreadVanished";
+  }
+  return "";
+}

From 53d5ffea6be7216589599b6415c021f8bd13cd37 Mon Sep 17 00:00:00 2001
From: Fabian Parzefall
Date: Thu, 5 Sep 2024 12:39:39 -0700
Subject: [PATCH 283/425] [clang] Check inline defs when emitting speculative vtable (#100785)

Clang should only emit an available_externally vtable when there are no
unused virtual inline functions. Currently, if such a function is
declared without inline inside the class, but is defined inline outside
the class, Clang might emit the vtable as available_externally. This
happens because Clang only considers the declarations of vtable entries,
but not the definitions. This patch addresses this by inspecting the
definitions in addition to the declarations.

---
 clang/lib/CodeGen/ItaniumCXXABI.cpp | 19 +++++++++++---
 .../vtable-available-externally.cpp | 25 +++++++++++++------
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 0cde8a192eda0..fb1eb72d9f340 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -442,7 +442,10 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI {
       continue;

     const CXXMethodDecl *Method = VtableComponent.getFunctionDecl();
-    if (!Method->getCanonicalDecl()->isInlined())
+    const FunctionDecl *FD = Method->getDefinition();
+    const bool IsInlined =
+        Method->getCanonicalDecl()->isInlined() || (FD && FD->isInlined());
+    if (!IsInlined)
       continue;

     StringRef Name = CGM.getMangledName(VtableComponent.getGlobalDecl());
@@ -2279,8 +2282,18 @@ bool ItaniumCXXABI::canSpeculativelyEmitVTableAsBaseClass(
   if (CGM.getCodeGenOpts().ForceEmitVTables)
     return true;

-  // If we don't have any not emitted inline virtual function then we are safe
-  // to emit an available_externally copy of vtable.
+  // A speculative vtable can only be generated if all virtual inline functions
+  // defined by this class are emitted. The vtable in the final program contains
+  // for each virtual inline function not used in the current TU a function that
+  // is equivalent to the unused function. The function in the actual vtable
+  // does not have to be declared under the same symbol (e.g., a virtual
+  // destructor that can be substituted with its base class's destructor). Since
+  // inline functions are emitted lazily and this emission does not account for
+  // speculative emission of a vtable, we might generate a speculative vtable
+  // with references to inline functions that are not emitted under that name.
+  // This can lead to problems when devirtualizing a call to such a function,
+  // which can result in linking errors. Hence, if there are any unused virtual
+  // inline functions, we cannot emit the speculative vtable.
// FIXME we can still emit a copy of the vtable if we // can emit definition of the inline functions. if (hasAnyUnusedVirtualInlineFunction(RD)) diff --git a/clang/test/CodeGenCXX/vtable-available-externally.cpp b/clang/test/CodeGenCXX/vtable-available-externally.cpp index a57eb39edfe10..ab105260bc75a 100644 --- a/clang/test/CodeGenCXX/vtable-available-externally.cpp +++ b/clang/test/CodeGenCXX/vtable-available-externally.cpp @@ -250,28 +250,39 @@ struct C : A { virtual void car(); }; +// Inline definition outside body, so we can't emit vtable available_externally +// (see previous). +// CHECK-TEST10-DAG: @_ZTVN6Test101FE = external unnamed_addr constant +struct F : A { + void foo(); + virtual void cat(); // inline outside body +}; +inline void F::cat() {} + // no key function, vtable will be generated everywhere it will be used // CHECK-TEST10-DAG: @_ZTVN6Test101EE = linkonce_odr unnamed_addr constant // CHECK-FORCE-EMIT-DAG: @_ZTVN6Test101EE = linkonce_odr unnamed_addr constant struct E : A {}; -void g(A& a) { +void h(A& a) { a.foo(); a.bar(); } -void f() { +void g() { A a; - g(a); + h(a); B b; - g(b); + h(b); C c; - g(c); + h(c); D d; - g(d); + h(d); E e; - g(e); + h(e); + F f; + h(f); } } // Test10 From b798f4bd50bbf0f5eb46804afad10629797c73aa Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 12:44:13 -0700 Subject: [PATCH 284/425] [lldb] Make deep copies of Status explicit (NFC) (#107170) --- lldb/bindings/python/python-swigsafecast.swig | 4 +-- lldb/include/lldb/API/SBError.h | 4 +-- lldb/include/lldb/API/SBValueList.h | 2 +- .../lldb/Core/ValueObjectConstResult.h | 4 +-- lldb/include/lldb/Utility/Status.h | 27 +++++++++++++++++-- lldb/source/API/SBBreakpoint.cpp | 6 ++--- lldb/source/API/SBBreakpointLocation.cpp | 4 +-- lldb/source/API/SBBreakpointName.cpp | 17 ++++++------ lldb/source/API/SBDebugger.cpp | 2 +- lldb/source/API/SBError.cpp | 15 ++++++----- lldb/source/API/SBFile.cpp | 15 ++++------- lldb/source/API/SBFormat.cpp | 2 +- lldb/source/API/SBFrame.cpp | 9 ++++--- lldb/source/API/SBPlatform.cpp | 4 +-- lldb/source/API/SBProcess.cpp | 2 +- lldb/source/API/SBSaveCoreOptions.cpp | 3 +-- lldb/source/API/SBStructuredData.cpp | 2 +- lldb/source/API/SBTarget.cpp | 5 ++-- lldb/source/API/SBThread.cpp | 2 +- lldb/source/API/SBValue.cpp | 4 +-- lldb/source/API/SBValueList.cpp | 13 ++++----- lldb/source/API/SBWatchpoint.cpp | 2 +- .../source/Commands/CommandObjectCommands.cpp | 4 +-- lldb/source/Core/Debugger.cpp | 2 +- lldb/source/Core/ModuleList.cpp | 5 ++-- lldb/source/Core/ValueObject.cpp | 4 +-- lldb/source/Core/ValueObjectCast.cpp | 2 +- lldb/source/Core/ValueObjectConstResult.cpp | 9 ++++--- lldb/source/Core/ValueObjectDynamicValue.cpp | 2 +- .../Core/ValueObjectSyntheticFilter.cpp | 2 +- lldb/source/Expression/Materializer.cpp | 2 +- lldb/source/Expression/UserExpression.cpp | 8 +++--- lldb/source/Host/common/LockFileBase.cpp | 4 +-- .../Host/common/NativeProcessProtocol.cpp | 8 +++--- .../posix/ConnectionFileDescriptorPosix.cpp | 12 ++++----- .../source/Interpreter/CommandInterpreter.cpp | 2 +- lldb/source/Interpreter/ScriptInterpreter.cpp | 2 +- .../Plugins/Platform/Android/AdbClient.cpp | 8 +++--- .../PlatformAndroidRemoteGDBServer.cpp | 4 +-- ...PlatformiOSSimulatorCoreSimulatorSupport.h | 2 +- ...latformiOSSimulatorCoreSimulatorSupport.mm | 12 ++++----- .../Plugins/Platform/POSIX/PlatformPOSIX.cpp | 4 +-- .../Platform/Windows/PlatformWindows.cpp | 4 +-- .../ScriptedProcessPythonInterface.cpp | 6 ++--- .../Interfaces/ScriptedPythonInterface.h | 8 ++++-- 
.../Python/SWIGPythonBridge.h | 2 +- .../Plugins/SymbolFile/DWARF/DWARFUnit.h | 2 +- .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 2 +- .../DWARF/SymbolFileDWARFDebugMap.cpp | 2 +- lldb/source/Target/ModuleCache.cpp | 2 +- lldb/source/Target/Platform.cpp | 4 +-- lldb/source/Target/Process.cpp | 4 +-- lldb/source/Target/StackFrame.cpp | 2 +- lldb/source/Target/Target.cpp | 12 ++++----- lldb/source/Utility/Status.cpp | 7 +++++ .../Python/PythonTestSuite.cpp | 2 +- .../Target/RemoteAwarePlatformTest.cpp | 8 +++--- 57 files changed, 172 insertions(+), 141 deletions(-) diff --git a/lldb/bindings/python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig index 34f8c6f0ff8d3..0127ac6bfa468 100644 --- a/lldb/bindings/python/python-swigsafecast.swig +++ b/lldb/bindings/python/python-swigsafecast.swig @@ -33,8 +33,8 @@ PythonObject SWIGBridge::ToSWIGWrapper(lldb::BreakpointSP breakpoint_sp) { SWIGTYPE_p_lldb__SBBreakpoint); } -PythonObject SWIGBridge::ToSWIGWrapper(const Status& status) { - return ToSWIGHelper(new lldb::SBError(status), SWIGTYPE_p_lldb__SBError); +PythonObject SWIGBridge::ToSWIGWrapper(Status status) { + return ToSWIGHelper(new lldb::SBError(std::move(status)), SWIGTYPE_p_lldb__SBError); } PythonObject SWIGBridge::ToSWIGWrapper(std::unique_ptr data_sb) { diff --git a/lldb/include/lldb/API/SBError.h b/lldb/include/lldb/API/SBError.h index 17f2c6c3027af..9f55f92131c06 100644 --- a/lldb/include/lldb/API/SBError.h +++ b/lldb/include/lldb/API/SBError.h @@ -97,7 +97,7 @@ class LLDB_API SBError { friend class lldb_private::ScriptInterpreter; friend class lldb_private::python::SWIGBridge; - SBError(const lldb_private::Status &error); + SBError(lldb_private::Status &&error); lldb_private::Status *get(); @@ -107,7 +107,7 @@ class LLDB_API SBError { lldb_private::Status &ref(); - void SetError(const lldb_private::Status &lldb_error); + void SetError(lldb_private::Status &&lldb_error); private: std::unique_ptr m_opaque_up; diff --git a/lldb/include/lldb/API/SBValueList.h b/lldb/include/lldb/API/SBValueList.h index a5017bccc5053..52a86f989e153 100644 --- a/lldb/include/lldb/API/SBValueList.h +++ b/lldb/include/lldb/API/SBValueList.h @@ -96,7 +96,7 @@ class LLDB_API SBValueList { std::unique_ptr m_opaque_up; - void SetError(const lldb_private::Status &status); + void SetError(lldb_private::Status &&status); }; } // namespace lldb diff --git a/lldb/include/lldb/Core/ValueObjectConstResult.h b/lldb/include/lldb/Core/ValueObjectConstResult.h index d3b3362bd0e9e..9c34617af71d0 100644 --- a/lldb/include/lldb/Core/ValueObjectConstResult.h +++ b/lldb/include/lldb/Core/ValueObjectConstResult.h @@ -61,7 +61,7 @@ class ValueObjectConstResult : public ValueObject { // When an expression fails to evaluate, we return an error static lldb::ValueObjectSP Create(ExecutionContextScope *exe_scope, - const Status &error); + Status &&error); std::optional GetByteSize() override; @@ -146,7 +146,7 @@ class ValueObjectConstResult : public ValueObject { ConstString name, Module *module = nullptr); ValueObjectConstResult(ExecutionContextScope *exe_scope, - ValueObjectManager &manager, const Status &error); + ValueObjectManager &manager, Status &&error); ValueObject *CreateChildAtIndex(size_t idx) override { return m_impl.CreateChildAtIndex(idx); diff --git a/lldb/include/lldb/Utility/Status.h b/lldb/include/lldb/Utility/Status.h index 308532e7c633d..795c830b96517 100644 --- a/lldb/include/lldb/Utility/Status.h +++ b/lldb/include/lldb/Utility/Status.h @@ -43,13 +43,32 @@ const char 
*ExpressionResultAsCString(lldb::ExpressionResults result); /// of themselves for printing results and error codes. The string value will /// be fetched on demand and its string value will be cached until the error /// is cleared of the value of the error changes. +/// +/// API design notes: +/// +/// Most APIs that currently vend a Status would be better served by +/// returning llvm::Expected<> instead. If possibles APIs should be +/// refactored to avoid Status. The only legitimate long-term uses of +/// Status are objects that need to store an error for a long time +/// (which should be questioned as a design decision, too). +/// +/// Implementation notes: +/// +/// Internally, Status stores an llvm::Error. +/// eErrorTypeInvalid +/// eErrorTypeGeneric llvm::StringError +/// eErrorTypePOSIX llvm::ECError +/// eErrorTypeMachKernel MachKernelError +/// eErrorTypeExpression llvm::ErrorList +/// eErrorTypeWin32 Win32Error + class Status { public: - /// Every error value that this object can contain needs to be able to fit /// into ValueType. typedef uint32_t ValueType; Status(); + Status(Status &&other) = default; /// Initialize the error object with a generic success value. /// @@ -93,10 +112,14 @@ class Status { ~Status(); + const Status &operator=(Status &&); /// Avoid using this in new code. Migrate APIs to llvm::Expected instead. static Status FromError(llvm::Error error); - /// FIXME: Replace this with a takeError method. + /// FIXME: Replace this with a takeError() method. llvm::Error ToError() const; + /// Don't call this function in new code. Instead, redesign the API + /// to use llvm::Expected instead of Status. + Status Clone() const { return Status(ToError()); } /// Get the error string associated with the current error. // diff --git a/lldb/source/API/SBBreakpoint.cpp b/lldb/source/API/SBBreakpoint.cpp index 3e9c01080588e..b2ed034d19983 100644 --- a/lldb/source/API/SBBreakpoint.cpp +++ b/lldb/source/API/SBBreakpoint.cpp @@ -622,7 +622,7 @@ SBError SBBreakpoint::SetScriptCallbackFunction( callback_function_name, extra_args.m_impl_up ->GetObjectSP()); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); } else sb_error = Status::FromErrorString("invalid breakpoint"); @@ -645,7 +645,7 @@ SBError SBBreakpoint::SetScriptCallbackBody(const char *callback_body_text) { .GetScriptInterpreter() ->SetBreakpointCommandCallback(bp_options, callback_body_text, /*is_callback=*/false); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); } else sb_error = Status::FromErrorString("invalid breakpoint"); @@ -670,7 +670,7 @@ SBError SBBreakpoint::AddNameWithErrorHandling(const char *new_name) { bkpt_sp->GetTarget().GetAPIMutex()); Status error; bkpt_sp->GetTarget().AddNameToBreakpoint(bkpt_sp, new_name, error); - status.SetError(error); + status.SetError(std::move(error)); } else { status = Status::FromErrorString("invalid breakpoint"); } diff --git a/lldb/source/API/SBBreakpointLocation.cpp b/lldb/source/API/SBBreakpointLocation.cpp index e5c96b81e8090..b2d1da3927c6e 100644 --- a/lldb/source/API/SBBreakpointLocation.cpp +++ b/lldb/source/API/SBBreakpointLocation.cpp @@ -239,7 +239,7 @@ SBError SBBreakpointLocation::SetScriptCallbackFunction( callback_function_name, extra_args.m_impl_up ->GetObjectSP()); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); } else sb_error = Status::FromErrorString("invalid breakpoint"); @@ -264,7 +264,7 @@ SBBreakpointLocation::SetScriptCallbackBody(const char *callback_body_text) { .GetScriptInterpreter() 
->SetBreakpointCommandCallback(bp_options, callback_body_text, /*is_callback=*/false); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); } else sb_error = Status::FromErrorString("invalid breakpoint"); diff --git a/lldb/source/API/SBBreakpointName.cpp b/lldb/source/API/SBBreakpointName.cpp index 7dc8dee19f43d..831260d44e8e7 100644 --- a/lldb/source/API/SBBreakpointName.cpp +++ b/lldb/source/API/SBBreakpointName.cpp @@ -570,14 +570,13 @@ SBError SBBreakpointName::SetScriptCallbackFunction( m_impl_up->GetTarget()->GetAPIMutex()); BreakpointOptions &bp_options = bp_name->GetOptions(); - Status error; - error = m_impl_up->GetTarget() - ->GetDebugger() - .GetScriptInterpreter() - ->SetBreakpointCommandCallbackFunction( - bp_options, callback_function_name, - extra_args.m_impl_up->GetObjectSP()); - sb_error.SetError(error); + Status error = m_impl_up->GetTarget() + ->GetDebugger() + .GetScriptInterpreter() + ->SetBreakpointCommandCallbackFunction( + bp_options, callback_function_name, + extra_args.m_impl_up->GetObjectSP()); + sb_error.SetError(std::move(error)); UpdateName(*bp_name); return sb_error; } @@ -600,7 +599,7 @@ SBBreakpointName::SetScriptCallbackBody(const char *callback_body_text) { .GetScriptInterpreter() ->SetBreakpointCommandCallback( bp_options, callback_body_text, /*is_callback=*/false); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); if (!sb_error.Fail()) UpdateName(*bp_name); diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index c226acc15018e..b21d7e6729007 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -1360,7 +1360,7 @@ SBError SBDebugger::SetInternalVariable(const char *var_name, const char *value, "invalid debugger instance name '%s'", debugger_instance_name); } if (error.Fail()) - sb_error.SetError(error); + sb_error.SetError(std::move(error)); return sb_error; } diff --git a/lldb/source/API/SBError.cpp b/lldb/source/API/SBError.cpp index 30d9ccc78ee37..31964931649db 100644 --- a/lldb/source/API/SBError.cpp +++ b/lldb/source/API/SBError.cpp @@ -23,7 +23,8 @@ SBError::SBError() { LLDB_INSTRUMENT_VA(this); } SBError::SBError(const SBError &rhs) { LLDB_INSTRUMENT_VA(this, rhs); - m_opaque_up = clone(rhs.m_opaque_up); + if (rhs.m_opaque_up) + m_opaque_up = std::make_unique(rhs.m_opaque_up->Clone()); } SBError::SBError(const char *message) { @@ -32,8 +33,8 @@ SBError::SBError(const char *message) { SetErrorString(message); } -SBError::SBError(const lldb_private::Status &status) - : m_opaque_up(new Status(status)) { +SBError::SBError(lldb_private::Status &&status) + : m_opaque_up(new Status(std::move(status))) { LLDB_INSTRUMENT_VA(this, status); } @@ -43,7 +44,9 @@ const SBError &SBError::operator=(const SBError &rhs) { LLDB_INSTRUMENT_VA(this, rhs); if (this != &rhs) - m_opaque_up = clone(rhs.m_opaque_up); + if (rhs.m_opaque_up) + m_opaque_up = std::make_unique(rhs.m_opaque_up->Clone()); + return *this; } @@ -111,9 +114,9 @@ void SBError::SetError(uint32_t err, ErrorType type) { *m_opaque_up = Status(err, type); } -void SBError::SetError(const Status &lldb_error) { +void SBError::SetError(Status &&lldb_error) { CreateIfNeeded(); - *m_opaque_up = lldb_error; + *m_opaque_up = std::move(lldb_error); } void SBError::SetErrorToErrno() { diff --git a/lldb/source/API/SBFile.cpp b/lldb/source/API/SBFile.cpp index 623708780f4c6..2ae4b1481afbf 100644 --- a/lldb/source/API/SBFile.cpp +++ b/lldb/source/API/SBFile.cpp @@ -62,8 +62,7 @@ SBError SBFile::Read(uint8_t *buf, size_t num_bytes, 
size_t *bytes_read) { error = Status::FromErrorString("invalid SBFile"); *bytes_read = 0; } else { - Status status = m_opaque_sp->Read(buf, num_bytes); - error.SetError(status); + error.SetError(m_opaque_sp->Read(buf, num_bytes)); *bytes_read = num_bytes; } return error; @@ -78,8 +77,7 @@ SBError SBFile::Write(const uint8_t *buf, size_t num_bytes, error = Status::FromErrorString("invalid SBFile"); *bytes_written = 0; } else { - Status status = m_opaque_sp->Write(buf, num_bytes); - error.SetError(status); + error.SetError(m_opaque_sp->Write(buf, num_bytes)); *bytes_written = num_bytes; } return error; @@ -92,8 +90,7 @@ SBError SBFile::Flush() { if (!m_opaque_sp) { error = Status::FromErrorString("invalid SBFile"); } else { - Status status = m_opaque_sp->Flush(); - error.SetError(status); + error.SetError(m_opaque_sp->Flush()); } return error; } @@ -106,10 +103,8 @@ bool SBFile::IsValid() const { SBError SBFile::Close() { LLDB_INSTRUMENT_VA(this); SBError error; - if (m_opaque_sp) { - Status status = m_opaque_sp->Close(); - error.SetError(status); - } + if (m_opaque_sp) + error.SetError(m_opaque_sp->Close()); return error; } diff --git a/lldb/source/API/SBFormat.cpp b/lldb/source/API/SBFormat.cpp index 51cddceea0372..080e219d64a36 100644 --- a/lldb/source/API/SBFormat.cpp +++ b/lldb/source/API/SBFormat.cpp @@ -36,7 +36,7 @@ SBFormat::SBFormat(const char *format, lldb::SBError &error) { FormatEntrySP format_entry_sp = std::make_shared(); Status status = FormatEntity::Parse(format, *format_entry_sp); - error.SetError(status); + error.SetError(std::move(status)); if (error.Success()) m_opaque_sp = format_entry_sp; } diff --git a/lldb/source/API/SBFrame.cpp b/lldb/source/API/SBFrame.cpp index b1360e88bcd3a..30c44b974988d 100644 --- a/lldb/source/API/SBFrame.cpp +++ b/lldb/source/API/SBFrame.cpp @@ -809,7 +809,7 @@ SBValueList SBFrame::GetVariables(const lldb::SBVariablesOptions &options) { Status var_error; variable_list = frame->GetVariableList(true, &var_error); if (var_error.Fail()) - value_list.SetError(var_error); + value_list.SetError(std::move(var_error)); if (variable_list) { const size_t num_variables = variable_list->GetSize(); if (num_variables) { @@ -1033,7 +1033,8 @@ SBValue SBFrame::EvaluateExpression(const char *expr) { Status error; error = Status::FromErrorString("can't evaluate expressions when the " "process is running."); - ValueObjectSP error_val_sp = ValueObjectConstResult::Create(nullptr, error); + ValueObjectSP error_val_sp = + ValueObjectConstResult::Create(nullptr, std::move(error)); result.SetSP(error_val_sp, false); } return result; @@ -1129,13 +1130,13 @@ lldb::SBValue SBFrame::EvaluateExpression(const char *expr, Status error; error = Status::FromErrorString("can't evaluate expressions when the " "process is running."); - expr_value_sp = ValueObjectConstResult::Create(nullptr, error); + expr_value_sp = ValueObjectConstResult::Create(nullptr, std::move(error)); expr_result.SetSP(expr_value_sp, false); } } else { Status error; error = Status::FromErrorString("sbframe object is not valid."); - expr_value_sp = ValueObjectConstResult::Create(nullptr, error); + expr_value_sp = ValueObjectConstResult::Create(nullptr, std::move(error)); expr_result.SetSP(expr_value_sp, false); } diff --git a/lldb/source/API/SBPlatform.cpp b/lldb/source/API/SBPlatform.cpp index 2f0f925302f16..394268b77aa21 100644 --- a/lldb/source/API/SBPlatform.cpp +++ b/lldb/source/API/SBPlatform.cpp @@ -586,7 +586,7 @@ SBProcess SBPlatform::Attach(SBAttachInfo &attach_info, Status status; ProcessSP 
process_sp = platform_sp->Attach(info, debugger.ref(), target.GetSP().get(), status); - error.SetError(status); + error.SetError(std::move(status)); return SBProcess(process_sp); } @@ -728,7 +728,7 @@ SBError SBPlatform::SetLocateModuleCallback( symbol_file_spec = symbol_file_spec_sb.ref(); } - return error.ref(); + return error.ref().Clone(); }); return SBError(); } diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 3eed51f0245f6..9773144723c34 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -1450,7 +1450,7 @@ lldb::SBError SBProcess::DeallocateMemory(lldb::addr_t ptr) { std::lock_guard guard( process_sp->GetTarget().GetAPIMutex()); Status error = process_sp->DeallocateMemory(ptr); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); } else { sb_error = Status::FromErrorString("process is running"); } diff --git a/lldb/source/API/SBSaveCoreOptions.cpp b/lldb/source/API/SBSaveCoreOptions.cpp index 2cd431611ef55..ef82b0253f119 100644 --- a/lldb/source/API/SBSaveCoreOptions.cpp +++ b/lldb/source/API/SBSaveCoreOptions.cpp @@ -40,8 +40,7 @@ SBSaveCoreOptions::operator=(const SBSaveCoreOptions &rhs) { SBError SBSaveCoreOptions::SetPluginName(const char *name) { LLDB_INSTRUMENT_VA(this, name); - lldb_private::Status error = m_opaque_up->SetPluginName(name); - return SBError(error); + return SBError(m_opaque_up->SetPluginName(name)); } void SBSaveCoreOptions::SetStyle(lldb::SaveCoreStyle style) { diff --git a/lldb/source/API/SBStructuredData.cpp b/lldb/source/API/SBStructuredData.cpp index 801ebf45e0e52..b891a34bd7c76 100644 --- a/lldb/source/API/SBStructuredData.cpp +++ b/lldb/source/API/SBStructuredData.cpp @@ -133,7 +133,7 @@ lldb::SBError SBStructuredData::GetDescription(lldb::SBStream &stream) const { Status error = m_impl_up->GetDescription(stream.ref()); SBError sb_error; - sb_error.SetError(error); + sb_error.SetError(std::move(error)); return sb_error; } diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index 41eb77e5506bc..1c1f7e2a03def 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -1369,7 +1369,7 @@ SBTarget::WatchpointCreateByAddress(lldb::addr_t addr, size_t size, CompilerType *type = nullptr; watchpoint_sp = target_sp->CreateWatchpoint(addr, size, type, watch_type, cw_error); - error.SetError(cw_error); + error.SetError(std::move(cw_error)); sb_watchpoint.SetSP(watchpoint_sp); } @@ -2326,7 +2326,8 @@ lldb::SBValue SBTarget::EvaluateExpression(const char *expr, Status error; error = Status::FromErrorString("can't evaluate expressions when the " "process is running."); - expr_value_sp = ValueObjectConstResult::Create(nullptr, error); + expr_value_sp = + ValueObjectConstResult::Create(nullptr, std::move(error)); } } else { target->EvaluateExpression(expr, frame, expr_value_sp, options.ref()); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 92868130f6222..7508eed5d6fdb 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -958,7 +958,7 @@ SBError SBThread::JumpToLine(lldb::SBFileSpec &file_spec, uint32_t line) { Thread *thread = exe_ctx.GetThreadPtr(); Status err = thread->JumpToLine(file_spec.ref(), line, true); - sb_error.SetError(err); + sb_error.SetError(std::move(err)); return sb_error; } diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp index df0e82b6523fb..273aac5ad4798 100644 --- a/lldb/source/API/SBValue.cpp +++ b/lldb/source/API/SBValue.cpp @@ -270,7 +270,7 @@ SBError 
SBValue::GetError() { ValueLocker locker; lldb::ValueObjectSP value_sp(GetSP(locker)); if (value_sp) - sb_error.SetError(value_sp->GetError()); + sb_error.SetError(value_sp->GetError().Clone()); else sb_error = Status::FromErrorStringWithFormat("error: %s", locker.GetError().AsCString()); @@ -1476,7 +1476,7 @@ lldb::SBWatchpoint SBValue::Watch(bool resolve_location, bool read, bool write, CompilerType type(value_sp->GetCompilerType()); WatchpointSP watchpoint_sp = target_sp->CreateWatchpoint(addr, byte_size, &type, watch_type, rc); - error.SetError(rc); + error.SetError(std::move(rc)); if (watchpoint_sp) { sb_watchpoint.SetSP(watchpoint_sp); diff --git a/lldb/source/API/SBValueList.cpp b/lldb/source/API/SBValueList.cpp index ba7e06971dc36..63b915094baf2 100644 --- a/lldb/source/API/SBValueList.cpp +++ b/lldb/source/API/SBValueList.cpp @@ -22,13 +22,14 @@ class ValueListImpl { public: ValueListImpl() = default; - ValueListImpl(const ValueListImpl &rhs) = default; + ValueListImpl(const ValueListImpl &rhs) + : m_values(rhs.m_values), m_error(rhs.m_error.Clone()) {} ValueListImpl &operator=(const ValueListImpl &rhs) { if (this == &rhs) return *this; m_values = rhs.m_values; - m_error = rhs.m_error; + m_error = rhs.m_error.Clone(); return *this; } @@ -67,7 +68,7 @@ class ValueListImpl { const Status &GetError() const { return m_error; } - void SetError(const Status &error) { m_error = error; } + void SetError(Status &&error) { m_error = std::move(error); } private: std::vector m_values; @@ -205,10 +206,10 @@ lldb::SBError SBValueList::GetError() { LLDB_INSTRUMENT_VA(this); SBError sb_error; if (m_opaque_up) - sb_error.SetError(m_opaque_up->GetError()); + sb_error.SetError(m_opaque_up->GetError().Clone()); return sb_error; } -void SBValueList::SetError(const lldb_private::Status &status) { - ref().SetError(status); +void SBValueList::SetError(lldb_private::Status &&status) { + ref().SetError(std::move(status)); } diff --git a/lldb/source/API/SBWatchpoint.cpp b/lldb/source/API/SBWatchpoint.cpp index 9664bbe618600..21e2dcc01968e 100644 --- a/lldb/source/API/SBWatchpoint.cpp +++ b/lldb/source/API/SBWatchpoint.cpp @@ -86,7 +86,7 @@ SBError SBWatchpoint::GetError() { SBError sb_error; lldb::WatchpointSP watchpoint_sp(GetSP()); if (watchpoint_sp) { - sb_error.SetError(watchpoint_sp->GetError()); + sb_error.SetError(watchpoint_sp->GetError().Clone()); } return sb_error; } diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index f8f2b97eb898f..e3291640fa935 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -1874,8 +1874,8 @@ class CommandObjectScriptingObjectParsed : public CommandObjectParsed { ~CommandObjectScriptingObjectParsed() override = default; - Status GetOptionsError() { return m_options_error; } - Status GetArgsError() { return m_args_error; } + Status GetOptionsError() { return m_options_error.Clone(); } + Status GetArgsError() { return m_args_error.Clone(); } bool WantsCompletion() override { return true; } bool IsRemovable() const override { return true; } diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 1266355578e32..9bdc5a3949751 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -258,7 +258,7 @@ Status Debugger::SetPropertyValue(const ExecutionContext *exe_ctx, StreamString feedback_stream; if (!target_sp->LoadScriptingResources(errors, feedback_stream)) { Stream &s = GetErrorStream(); - for (auto 
error : errors) { + for (auto &error : errors) { s.Printf("%s\n", error.AsCString()); } if (feedback_stream.GetSize()) diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index bba4199ce9e83..2b8ccab2406c6 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -1046,8 +1046,8 @@ bool ModuleList::LoadScriptingResourcesInTarget(Target *target, return false; std::lock_guard guard(m_modules_mutex); for (auto module : m_modules) { - Status error; if (module) { + Status error; if (!module->LoadScriptingResourceInTarget(target, error, feedback_stream)) { if (error.Fail() && error.AsCString()) { @@ -1058,8 +1058,7 @@ bool ModuleList::LoadScriptingResourcesInTarget(Target *target, .GetFileNameStrippingExtension() .GetCString(), error.AsCString()); - errors.push_back(error); - + errors.push_back(std::move(error)); if (!continue_on_error) return false; } diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp index d56bd004e63c7..1bedd87e943dc 100644 --- a/lldb/source/Core/ValueObject.cpp +++ b/lldb/source/Core/ValueObject.cpp @@ -2763,7 +2763,7 @@ ValueObjectSP ValueObject::CreateConstantValue(ConstString name) { if (!valobj_sp) { ExecutionContext exe_ctx(GetExecutionContextRef()); valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), m_error); + exe_ctx.GetBestExecutionContextScope(), m_error.Clone()); } return valobj_sp; } @@ -2974,7 +2974,7 @@ ValueObjectSP ValueObject::Cast(const CompilerType &compiler_type) { return ValueObjectConstResult::Create( ExecutionContext(GetExecutionContextRef()).GetBestExecutionContextScope(), - error); + std::move(error)); } lldb::ValueObjectSP ValueObject::Clone(ConstString new_name) { diff --git a/lldb/source/Core/ValueObjectCast.cpp b/lldb/source/Core/ValueObjectCast.cpp index c8e3164151417..308fa161180d4 100644 --- a/lldb/source/Core/ValueObjectCast.cpp +++ b/lldb/source/Core/ValueObjectCast.cpp @@ -86,7 +86,7 @@ bool ValueObjectCast::UpdateValue() { // The dynamic value failed to get an error, pass the error along if (m_error.Success() && m_parent->GetError().Fail()) - m_error = m_parent->GetError(); + m_error = m_parent->GetError().Clone(); SetValueIsValid(false); return false; } diff --git a/lldb/source/Core/ValueObjectConstResult.cpp b/lldb/source/Core/ValueObjectConstResult.cpp index 879d3c3f6b037..60850c15e6a83 100644 --- a/lldb/source/Core/ValueObjectConstResult.cpp +++ b/lldb/source/Core/ValueObjectConstResult.cpp @@ -169,16 +169,17 @@ ValueObjectConstResult::ValueObjectConstResult( } ValueObjectSP ValueObjectConstResult::Create(ExecutionContextScope *exe_scope, - const Status &error) { + Status &&error) { auto manager_sp = ValueObjectManager::Create(); - return (new ValueObjectConstResult(exe_scope, *manager_sp, error))->GetSP(); + return (new ValueObjectConstResult(exe_scope, *manager_sp, std::move(error))) + ->GetSP(); } ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ValueObjectManager &manager, - const Status &error) + Status &&error) : ValueObject(exe_scope, manager), m_impl(this) { - m_error = error; + m_error = std::move(error); SetIsConstant(); } diff --git a/lldb/source/Core/ValueObjectDynamicValue.cpp b/lldb/source/Core/ValueObjectDynamicValue.cpp index d6c523e99f10d..67311ea6e3be4 100644 --- a/lldb/source/Core/ValueObjectDynamicValue.cpp +++ b/lldb/source/Core/ValueObjectDynamicValue.cpp @@ -118,7 +118,7 @@ bool ValueObjectDynamicValue::UpdateValue() { if (!m_parent->UpdateValueIfNeeded(false)) { 
// The dynamic value failed to get an error, pass the error along if (m_error.Success() && m_parent->GetError().Fail()) - m_error = m_parent->GetError(); + m_error = m_parent->GetError().Clone(); return false; } diff --git a/lldb/source/Core/ValueObjectSyntheticFilter.cpp b/lldb/source/Core/ValueObjectSyntheticFilter.cpp index adac1b400705e..091b8f883b605 100644 --- a/lldb/source/Core/ValueObjectSyntheticFilter.cpp +++ b/lldb/source/Core/ValueObjectSyntheticFilter.cpp @@ -169,7 +169,7 @@ bool ValueObjectSynthetic::UpdateValue() { // our parent could not update.. as we are meaningless without a parent, // just stop if (m_parent->GetError().Fail()) - m_error = m_parent->GetError(); + m_error = m_parent->GetError().Clone(); return false; } diff --git a/lldb/source/Expression/Materializer.cpp b/lldb/source/Expression/Materializer.cpp index fa0edad1fa583..8097e38919b04 100644 --- a/lldb/source/Expression/Materializer.cpp +++ b/lldb/source/Expression/Materializer.cpp @@ -462,7 +462,7 @@ class EntityVariableBase : public Materializer::Entity { return; } - Status valobj_error = valobj_sp->GetError(); + Status valobj_error = valobj_sp->GetError().Clone(); if (valobj_error.Fail()) { err = Status::FromErrorStringWithFormat( diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp index c2889e4c986bf..872f6304f91ba 100644 --- a/lldb/source/Expression/UserExpression.cpp +++ b/lldb/source/Expression/UserExpression.cpp @@ -268,10 +268,10 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, const bool generate_debug_info = options.GetGenerateDebugInfo(); if (options.InvokeCancelCallback(lldb::eExpressionEvaluationParse)) { - error = Status::FromErrorString( + Status error = Status::FromErrorString( "expression interrupted by callback before parse"); result_valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), error); + exe_ctx.GetBestExecutionContextScope(), std::move(error)); return lldb::eExpressionInterrupted; } @@ -364,7 +364,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, lldb::eExpressionInterrupted, "expression interrupted by callback before execution"); result_valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), error); + exe_ctx.GetBestExecutionContextScope(), std::move(error)); return lldb::eExpressionInterrupted; } @@ -415,7 +415,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, if (result_valobj_sp.get() == nullptr) { result_valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), error); + exe_ctx.GetBestExecutionContextScope(), std::move(error)); } return execution_results; diff --git a/lldb/source/Host/common/LockFileBase.cpp b/lldb/source/Host/common/LockFileBase.cpp index 01a8c38cef58e..6ef684e6d622c 100644 --- a/lldb/source/Host/common/LockFileBase.cpp +++ b/lldb/source/Host/common/LockFileBase.cpp @@ -50,7 +50,7 @@ Status LockFileBase::Unlock() { if (!IsLocked()) return NotLocked(); - const auto error = DoUnlock(); + Status error = DoUnlock(); if (error.Success()) { m_locked = false; m_start = 0; @@ -69,7 +69,7 @@ Status LockFileBase::DoLock(const Locker &locker, const uint64_t start, if (IsLocked()) return AlreadyLocked(); - const auto error = locker(start, len); + Status error = locker(start, len); if (error.Success()) { m_locked = true; m_start = start; diff --git a/lldb/source/Host/common/NativeProcessProtocol.cpp b/lldb/source/Host/common/NativeProcessProtocol.cpp index a84d8db1c8794..e36eefaa6f4a4 100644 --- 
a/lldb/source/Host/common/NativeProcessProtocol.cpp +++ b/lldb/source/Host/common/NativeProcessProtocol.cpp @@ -215,17 +215,17 @@ Status NativeProcessProtocol::RemoveWatchpoint(lldb::addr_t addr) { for (const auto &thread : m_threads) { assert(thread && "thread list should not have a NULL thread!"); - const Status thread_error = thread->RemoveWatchpoint(addr); + Status thread_error = thread->RemoveWatchpoint(addr); if (thread_error.Fail()) { // Keep track of the first thread error if any threads fail. We want to // try to remove the watchpoint from every thread, though, even if one or // more have errors. if (!overall_error.Fail()) - overall_error = thread_error; + overall_error = std::move(thread_error); } } - const Status error = m_watchpoint_list.Remove(addr); - return overall_error.Fail() ? overall_error : error; + Status error = m_watchpoint_list.Remove(addr); + return overall_error.Fail() ? std::move(overall_error) : std::move(error); } const HardwareBreakpointMap & diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp index 2a2fcf00c0adf..d0cc68826d4bb 100644 --- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp +++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp @@ -233,7 +233,7 @@ ConnectionStatus ConnectionFileDescriptor::Disconnect(Status *error_ptr) { if (error.Fail()) status = eConnectionStatusError; if (error_ptr) - *error_ptr = error; + *error_ptr = std::move(error); // Close any pipes we were using for async interrupts m_pipe.Close(); @@ -295,7 +295,7 @@ size_t ConnectionFileDescriptor::Read(void *dst, size_t dst_len, } if (error_ptr) - *error_ptr = error; + *error_ptr = error.Clone(); if (error.Fail()) { uint32_t error_value = error.GetError(); @@ -393,7 +393,7 @@ size_t ConnectionFileDescriptor::Write(const void *src, size_t src_len, } if (error_ptr) - *error_ptr = error; + *error_ptr = error.Clone(); if (error.Fail()) { switch (error.GetError()) { @@ -476,7 +476,7 @@ ConnectionFileDescriptor::BytesAvailable(const Timeout &timeout, Status error = select_helper.Select(); if (error_ptr) - *error_ptr = error; + *error_ptr = error.Clone(); if (error.Fail()) { switch (error.GetError()) { @@ -557,7 +557,7 @@ lldb::ConnectionStatus ConnectionFileDescriptor::AcceptSocket( } if (error_ptr) - *error_ptr = error; + *error_ptr = error.Clone(); return eConnectionStatusError; } @@ -579,7 +579,7 @@ ConnectionFileDescriptor::ConnectSocket(Socket::SocketProtocol socket_protocol, } if (error_ptr) - *error_ptr = error; + *error_ptr = error.Clone(); return eConnectionStatusError; } diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index df539d5f5bcee..b93f47a8a8d5e 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -1861,7 +1861,7 @@ CommandInterpreter::PreprocessToken(std::string &expr_str) { // But if for some reason we didn't get a value object at all, then we will // make up some helpful errors from the expression result. 
if (expr_result_valobj_sp) - error = expr_result_valobj_sp->GetError(); + error = expr_result_valobj_sp->GetError().Clone(); if (error.Success()) { std::string result = lldb_private::toString(expr_result) + diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 079ab9044de63..8b55221da6e76 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -96,7 +96,7 @@ lldb::ProcessLaunchInfoSP ScriptInterpreter::GetOpaqueTypeFromSBLaunchInfo( Status ScriptInterpreter::GetStatusFromSBError(const lldb::SBError &error) const { if (error.m_opaque_up) - return *error.m_opaque_up; + return error.m_opaque_up->Clone(); return Status(); } diff --git a/lldb/source/Plugins/Platform/Android/AdbClient.cpp b/lldb/source/Plugins/Platform/Android/AdbClient.cpp index 00e66f8818f07..a179260ca15f6 100644 --- a/lldb/source/Plugins/Platform/Android/AdbClient.cpp +++ b/lldb/source/Plugins/Platform/Android/AdbClient.cpp @@ -173,7 +173,7 @@ Status AdbClient::SetPortForwarding(const uint16_t local_port, snprintf(message, sizeof(message), "forward:tcp:%d;tcp:%d", local_port, remote_port); - const auto error = SendDeviceMessage(message); + Status error = SendDeviceMessage(message); if (error.Fail()) return error; @@ -192,7 +192,7 @@ AdbClient::SetPortForwarding(const uint16_t local_port, snprintf(message, sizeof(message), "forward:tcp:%d;%s:%s", local_port, sock_namespace_str, remote_socket_name.str().c_str()); - const auto error = SendDeviceMessage(message); + Status error = SendDeviceMessage(message); if (error.Fail()) return error; @@ -203,7 +203,7 @@ Status AdbClient::DeletePortForwarding(const uint16_t local_port) { char message[32]; snprintf(message, sizeof(message), "killforward:tcp:%d", local_port); - const auto error = SendDeviceMessage(message); + Status error = SendDeviceMessage(message); if (error.Fail()) return error; @@ -588,7 +588,7 @@ AdbClient::SyncService::executeCommand(const std::function &cmd) { if (!m_conn) return Status::FromErrorString("SyncService is disconnected"); - const auto error = cmd(); + Status error = cmd(); if (error.Fail()) m_conn.reset(); diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp index de7031701df4d..d18b718d4a56c 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp @@ -189,8 +189,8 @@ Status PlatformAndroidRemoteGDBServer::MakeConnectURL( Status error; auto forward = [&](const uint16_t local, const uint16_t remote) { - error = ForwardPortWithAdb(local, remote, remote_socket_name, - m_socket_namespace, m_device_id); + Status error = ForwardPortWithAdb(local, remote, remote_socket_name, + m_socket_namespace, m_device_id); if (error.Success()) { m_port_forwards[pid] = local; std::ostringstream url_str; diff --git a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.h b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.h index b2956054d50bf..57dd32f587a65 100644 --- a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.h +++ b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.h @@ -34,7 +34,7 @@ class Process { explicit operator bool() { return m_pid != LLDB_INVALID_PROCESS_ID; } - lldb_private::Status GetError() { return 
m_error; } + lldb_private::Status GetError() { return m_error.Clone(); } private: Process(lldb::pid_t p); diff --git a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm index 303a5409c6fe4..f3e79d3d56154 100644 --- a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm +++ b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm @@ -61,10 +61,10 @@ - (BOOL)spawnWithPath:(NSString *)path CoreSimulatorSupport::Process::Process(lldb::pid_t p) : m_pid(p), m_error() {} CoreSimulatorSupport::Process::Process(Status error) - : m_pid(LLDB_INVALID_PROCESS_ID), m_error(error) {} + : m_pid(LLDB_INVALID_PROCESS_ID), m_error(std::move(error)) {} CoreSimulatorSupport::Process::Process(lldb::pid_t p, Status error) - : m_pid(p), m_error(error) {} + : m_pid(p), m_error(std::move(error)) {} CoreSimulatorSupport::DeviceType::DeviceType() : m_model_identifier() {} @@ -498,19 +498,19 @@ static Status HandleFileAction(ProcessLaunchInfo &launch_info, STDIN_FILENO, stdin_file); if (error.Fail()) - return CoreSimulatorSupport::Process(error); + return CoreSimulatorSupport::Process(std::move(error)); error = HandleFileAction(launch_info, options, kSimDeviceSpawnStdout, STDOUT_FILENO, stdout_file); if (error.Fail()) - return CoreSimulatorSupport::Process(error); + return CoreSimulatorSupport::Process(std::move(error)); error = HandleFileAction(launch_info, options, kSimDeviceSpawnStderr, STDERR_FILENO, stderr_file); if (error.Fail()) - return CoreSimulatorSupport::Process(error); + return CoreSimulatorSupport::Process(std::move(error)); #undef kSimDeviceSpawnEnvironment #undef kSimDeviceSpawnStdin @@ -539,7 +539,7 @@ static Status HandleFileAction(ProcessLaunchInfo &launch_info, : "unable to launch"); } - return CoreSimulatorSupport::Process(pid, error); + return CoreSimulatorSupport::Process(pid, std::move(error)); } CoreSimulatorSupport::DeviceSet diff --git a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp index 70b49c0424bbb..31315e46ca168 100644 --- a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp +++ b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp @@ -544,7 +544,7 @@ Status PlatformPOSIX::EvaluateLibdlExpression( return expr_error; if (result_valobj_sp->GetError().Fail()) - return result_valobj_sp->GetError(); + return result_valobj_sp->GetError().Clone(); return Status(); } @@ -976,7 +976,7 @@ Status PlatformPOSIX::UnloadImage(lldb_private::Process *process, return error; if (result_valobj_sp->GetError().Fail()) - return result_valobj_sp->GetError(); + return result_valobj_sp->GetError().Clone(); Scalar scalar; if (result_valobj_sp->ResolveValue(scalar)) { diff --git a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp index 8173b7c3b5003..7352d6f33f217 100644 --- a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp +++ b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp @@ -444,7 +444,7 @@ Status PlatformWindows::UnloadImage(Process *process, uint32_t image_token) { return result; if (value->GetError().Fail()) - return value->GetError(); + return value->GetError().Clone(); Scalar scalar; if (value->ResolveValue(scalar)) { @@ -805,7 +805,7 @@ extern "C" { return error; if (value->GetError().Fail()) - return value->GetError(); + return value->GetError().Clone(); return 
Status(); } diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp index 8ba31b31e8dc1..f5fc337a8028e 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp @@ -111,7 +111,7 @@ bool ScriptedProcessPythonInterface::CreateBreakpoint(lldb::addr_t addr, // If there was an error on the python call, surface it to the user. if (py_error.Fail()) - error = py_error; + error = std::move(py_error); if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) @@ -128,7 +128,7 @@ lldb::DataExtractorSP ScriptedProcessPythonInterface::ReadMemoryAtAddress( // If there was an error on the python call, surface it to the user. if (py_error.Fail()) - error = py_error; + error = std::move(py_error); return data_sp; } @@ -145,7 +145,7 @@ lldb::offset_t ScriptedProcessPythonInterface::WriteMemoryAtAddress( // If there was an error on the python call, surface it to the user. if (py_error.Fail()) - error = py_error; + error = std::move(py_error); return obj->GetUnsignedIntegerValue(LLDB_INVALID_OFFSET); } diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h index c1dcdc7df6cee..cb6f6ec398926 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h @@ -309,8 +309,12 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { return python::PythonBoolean(arg); } - python::PythonObject Transform(Status arg) { - return python::SWIGBridge::ToSWIGWrapper(arg); + python::PythonObject Transform(const Status &arg) { + return python::SWIGBridge::ToSWIGWrapper(arg.Clone()); + } + + python::PythonObject Transform(Status &&arg) { + return python::SWIGBridge::ToSWIGWrapper(std::move(arg)); } python::PythonObject Transform(const StructuredDataImpl &arg) { diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 5351c1a698b4a..97a3837fd7aa6 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -86,7 +86,7 @@ class SWIGBridge { static PythonObject ToSWIGWrapper(lldb::ProcessSP process_sp); static PythonObject ToSWIGWrapper(lldb::ThreadPlanSP thread_plan_sp); static PythonObject ToSWIGWrapper(lldb::BreakpointSP breakpoint_sp); - static PythonObject ToSWIGWrapper(const Status &status); + static PythonObject ToSWIGWrapper(Status status); static PythonObject ToSWIGWrapper(const StructuredDataImpl &data_impl); static PythonObject ToSWIGWrapper(lldb::ThreadSP thread_sp); static PythonObject ToSWIGWrapper(lldb::StackFrameSP frame_sp); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h index 148932d67b908..1267e20f08712 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h @@ -268,7 +268,7 @@ class DWARFUnit : public UserID { /// .dwo file. 
Things like a missing .dwo file, DWO ID mismatch, and other /// .dwo errors can be stored in each compile unit so the issues can be /// communicated to the user. - void SetDwoError(const Status &error) { m_dwo_error = error; } + void SetDwoError(Status &&error) { m_dwo_error = std::move(error); } protected: DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index ff44329d081ca..f721ca00fd355 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -4480,7 +4480,7 @@ Status SymbolFileDWARF::CalculateFrameVariableError(StackFrame &frame) { dwarf_cu->ExtractUnitDIEIfNeeded(); const Status &dwo_error = dwarf_cu->GetDwoError(); if (dwo_error.Fail()) - return dwo_error; + return dwo_error.Clone(); // Don't return an error for assembly files as they typically don't have // varaible information. diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp index 0cd2d06cd708c..08ea4c6d1645a 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp @@ -1574,7 +1574,7 @@ Status SymbolFileDWARFDebugMap::CalculateFrameVariableError(StackFrame &frame) { // we weren't able to open the .o file. Display an appropriate // error if (comp_unit_info->oso_load_error.Fail()) - return comp_unit_info->oso_load_error; + return comp_unit_info->oso_load_error.Clone(); else return Status::FromErrorStringWithFormat( "unable to load debug map object file \"%s\" " diff --git a/lldb/source/Target/ModuleCache.cpp b/lldb/source/Target/ModuleCache.cpp index ccae7ea106c97..f737836e0d971 100644 --- a/lldb/source/Target/ModuleCache.cpp +++ b/lldb/source/Target/ModuleCache.cpp @@ -139,7 +139,7 @@ Status CreateHostSysRootModuleLink(const FileSpec &root_dir_spec, DecrementRefExistingModule(root_dir_spec, sysroot_module_path_spec); } - const auto error = MakeDirectory( + Status error = MakeDirectory( FileSpec(sysroot_module_path_spec.GetDirectory().AsCString())); if (error.Fail()) return error; diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp index 7792edcc2cb58..3e7546048e586 100644 --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -549,7 +549,7 @@ Status Platform::Install(const FileSpec &src, const FileSpec &dst) { RecurseCopyBaton baton = {recurse_dst, this, Status()}; FileSystem::Instance().EnumerateDirectory( src_dir_path, true, true, true, RecurseCopy_Callback, &baton); - return baton.error; + return std::move(baton.error); } } break; @@ -1566,7 +1566,7 @@ Status Platform::GetRemoteSharedModule(const ModuleSpec &module_spec, // resolved_module_spec. // Trying to find a module by UUID on local file system. 
- const Status error = module_resolver(resolved_module_spec); + Status error = module_resolver(resolved_module_spec); if (error.Success()) { if (module_sp && symbol_file_spec) { // Set the symbol file to the module if the locate modudle callback was diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index f2a631a466b35..40f3115f1ff6d 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -2147,7 +2147,6 @@ size_t Process::ReadCStringFromMemory(addr_t addr, char *dst, result_error.Clear(); // NULL out everything just to be safe memset(dst, 0, dst_max_len); - Status error; addr_t curr_addr = addr; const size_t cache_line_size = m_memory_cache.GetMemoryCacheLineSize(); size_t bytes_left = dst_max_len - 1; @@ -2158,10 +2157,11 @@ size_t Process::ReadCStringFromMemory(addr_t addr, char *dst, cache_line_size - (curr_addr % cache_line_size); addr_t bytes_to_read = std::min(bytes_left, cache_line_bytes_left); + Status error; size_t bytes_read = ReadMemory(curr_addr, curr_dst, bytes_to_read, error); if (bytes_read == 0) { - result_error = error; + result_error = std::move(error); dst[total_cstr_len] = '\0'; break; } diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 1610971a34148..fe0d4c93c5062 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1117,7 +1117,7 @@ bool StackFrame::GetFrameBaseValue(Scalar &frame_base, Status *error_ptr) { frame_base = m_frame_base; if (error_ptr) - *error_ptr = m_frame_base_error; + *error_ptr = m_frame_base_error.Clone(); return m_frame_base_error.Success(); } diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index fcac0a48f46e6..f1c378b968d2b 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -1986,7 +1986,6 @@ size_t Target::ReadCStringFromMemory(const Address &addr, char *dst, result_error.Clear(); // NULL out everything just to be safe memset(dst, 0, dst_max_len); - Status error; addr_t curr_addr = addr.GetLoadAddress(this); Address address(addr); @@ -2003,11 +2002,12 @@ size_t Target::ReadCStringFromMemory(const Address &addr, char *dst, cache_line_size - (curr_addr % cache_line_size); addr_t bytes_to_read = std::min(bytes_left, cache_line_bytes_left); + Status error; size_t bytes_read = ReadMemory(address, curr_dst, bytes_to_read, error, force_live_memory); if (bytes_read == 0) { - result_error = error; + result_error = std::move(error); dst[total_cstr_len] = '\0'; break; } @@ -2401,7 +2401,7 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec, } } if (error_ptr) - *error_ptr = error; + *error_ptr = std::move(error); return module_sp; } @@ -2730,7 +2730,7 @@ ExpressionResults Target::EvaluateExpression( // Pass up the error by wrapping it inside an error result. 
if (error.Fail() && !result_valobj_sp) result_valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), error); + exe_ctx.GetBestExecutionContextScope(), std::move(error)); } if (execution_results == eExpressionCompleted) @@ -3348,10 +3348,8 @@ Status Target::Launch(ProcessLaunchInfo &launch_info, Stream *stream) { else error = m_process_sp->Resume(); if (!error.Success()) { - Status error2; - error2 = Status::FromErrorStringWithFormat( + error = Status::FromErrorStringWithFormat( "process resume at entry point failed: %s", error.AsCString()); - error = error2; } } break; case eStateExited: { diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index 40e1fbf3fab1b..4af3af5fba018 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -107,6 +107,13 @@ llvm::Error Status::ToError() const { Status::~Status() = default; +const Status &Status::operator=(Status &&other) { + m_code = other.m_code; + m_type = other.m_type; + m_string = std::move(other.m_string); + return *this; +} + #ifdef _WIN32 static std::string RetrieveWin32ErrorString(uint32_t error_code) { char *buffer = nullptr; diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index 0edde54d310fd..f2746c3e2516f 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -283,7 +283,7 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPythonStopHookCallHandleStop( } python::PythonObject -lldb_private::python::SWIGBridge::ToSWIGWrapper(const Status &status) { +lldb_private::python::SWIGBridge::ToSWIGWrapper(Status status) { return python::PythonObject(); } diff --git a/lldb/unittests/Target/RemoteAwarePlatformTest.cpp b/lldb/unittests/Target/RemoteAwarePlatformTest.cpp index d7810b20af95d..3278674ed0a05 100644 --- a/lldb/unittests/Target/RemoteAwarePlatformTest.cpp +++ b/lldb/unittests/Target/RemoteAwarePlatformTest.cpp @@ -33,8 +33,8 @@ class RemoteAwarePlatformTester : public RemoteAwarePlatform { MOCK_METHOD0(CalculateTrapHandlerSymbolNames, void()); MOCK_METHOD2(ResolveExecutable, - std::pair(const ModuleSpec &, - const FileSpecList *)); + std::pair(const ModuleSpec &, + const FileSpecList *)); Status ResolveExecutable(const ModuleSpec &module_spec, lldb::ModuleSP &exe_module_sp, @@ -42,7 +42,7 @@ class RemoteAwarePlatformTester : public RemoteAwarePlatform { { // NOLINT(modernize-use-override) auto pair = ResolveExecutable(module_spec, module_search_paths_ptr); exe_module_sp = pair.second; - return pair.first; + return pair.first ? 
Status() : Status::FromErrorString("error"); } void SetRemotePlatform(lldb::PlatformSP platform) { @@ -81,7 +81,7 @@ TEST_F(RemoteAwarePlatformTest, TestResolveExecutabelOnClientByPlatform) { EXPECT_CALL(platform, GetSupportedArchitectures(process_host_arch)) .WillRepeatedly(Return(std::vector())); EXPECT_CALL(platform, ResolveExecutable(_, _)) - .WillRepeatedly(Return(std::make_pair(Status(), expected_executable))); + .WillRepeatedly(Return(std::make_pair(true, expected_executable))); platform.SetRemotePlatform(std::make_shared(false)); From 7b760894f247f4fa1b27c01c767c8599c169f996 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 12:49:16 -0700 Subject: [PATCH 285/425] [lldb] Convert NativeProcessLinux to new Status API (NFC) --- lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index cc0e34eecdf30..cea3fbf9112f5 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -1096,9 +1096,9 @@ Status NativeProcessLinux::Detach() { for (const auto &thread : m_threads) { Status e = Detach(thread->GetID()); + // Save the error, but still attempt to detach from other threads. if (e.Fail()) - error = - e; // Save the error, but still attempt to detach from other threads. + error = e.Clone; } m_intel_pt_collector.Clear(); @@ -1905,13 +1905,13 @@ Status NativeProcessLinux::ResumeThread(NativeThreadLinux &thread, // reflect it is running after this completes. switch (state) { case eStateRunning: { - const auto resume_result = thread.Resume(signo); + Status resume_result = thread.Resume(signo); if (resume_result.Success()) SetState(eStateRunning, true); return resume_result; } case eStateStepping: { - const auto step_result = thread.SingleStep(signo); + Status step_result = thread.SingleStep(signo); if (step_result.Success()) SetState(eStateRunning, true); return step_result; From 54b10555c32e9677ce15c408296f92b35cd3d29c Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Thu, 5 Sep 2024 15:51:00 -0400 Subject: [PATCH 286/425] [OpenMP] LIBOMPTARGET_DEVICE_ARCHITECTURES requires semicolons (#107454) If I use commas to delimit architectures in `LIBOMPTARGET_DEVICE_ARCHITECTURES`, cmake for the runtimes complains: ``` Unknown GPU architecture 'sm_70,sm_80,sm_90' ``` Semicolons are required instead. --- openmp/docs/SupportAndFAQ.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openmp/docs/SupportAndFAQ.rst b/openmp/docs/SupportAndFAQ.rst index a158422befd07..cd2d6a4703221 100644 --- a/openmp/docs/SupportAndFAQ.rst +++ b/openmp/docs/SupportAndFAQ.rst @@ -81,9 +81,9 @@ The Cuda SDK is required on the machine that will execute the openmp application If your build machine is not the target machine or automatic detection of the available GPUs failed, you should also set: -- ``LIBOMPTARGET_DEVICE_ARCHITECTURES=sm_,...`` where ```` is the numeric +- ``LIBOMPTARGET_DEVICE_ARCHITECTURES='sm_;...'`` where ```` is the numeric compute capability of your GPU. For instance, set - ``LIBOMPTARGET_DEVICE_ARCHITECTURES=sm_70,sm_80`` to target the Nvidia Volta + ``LIBOMPTARGET_DEVICE_ARCHITECTURES='sm_70;sm_80'`` to target the Nvidia Volta and Ampere architectures. 
@@ -141,9 +141,9 @@ With those libraries installed, then LLVM build and installed, try: If your build machine is not the target machine or automatic detection of the available GPUs failed, you should also set: -- ``LIBOMPTARGET_DEVICE_ARCHITECTURES=gfx,...`` where ```` is the +- ``LIBOMPTARGET_DEVICE_ARCHITECTURES='gfx;...'`` where ```` is the shader core instruction set architecture. For instance, set - ``LIBOMPTARGET_DEVICE_ARCHITECTURES=gfx906,gfx90a`` to target AMD GCN5 + ``LIBOMPTARGET_DEVICE_ARCHITECTURES='gfx906;gfx90a'`` to target AMD GCN5 and CDNA2 devices. Q: What are the known limitations of OpenMP AMDGPU offload? From 3b426a8951caa543b65f20ff265353fd79f436e5 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 12:53:08 -0700 Subject: [PATCH 287/425] [lldb] Convert NativeProcessLinux to new Status API (NFC) --- lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index cea3fbf9112f5..5c262db8db7fd 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -1098,7 +1098,7 @@ Status NativeProcessLinux::Detach() { Status e = Detach(thread->GetID()); // Save the error, but still attempt to detach from other threads. if (e.Fail()) - error = e.Clone; + error = e.Clone(); } m_intel_pt_collector.Clear(); From 3726f9c57537aff05bd6ecf309133ce05bebaf43 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Thu, 5 Sep 2024 13:10:37 -0700 Subject: [PATCH 288/425] [Attributor][NFC] Pre-commits for #107439 (#107457) --- .../Attributor/ArgumentPromotion/byval.ll | 8 +- .../IPConstantProp/2009-09-24-byval-ptr.ll | 8 +- .../IPConstantProp/return-constants.ll | 12 +- .../Transforms/Attributor/assumes_info.ll | 2 + .../Attributor/cb_liveness_disabled.ll | 1 + .../Attributor/cb_liveness_enabled.ll | 1 + .../Attributor/phi_bug_pointer_info.ll | 26 +++ .../Attributor/value-simplify-local-remote.ll | 47 ++-- .../Attributor/value-simplify-pointer-info.ll | 212 ++++++++++++++++++ 9 files changed, 284 insertions(+), 33 deletions(-) diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll index 621c6cf94313e..516b80b1e237b 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll @@ -59,12 +59,12 @@ define i32 @main() nounwind { ; TUNIT-NEXT: store i32 1, ptr [[S]], align 32 ; TUNIT-NEXT: [[VAL4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[S]], align 8 -; TUNIT-NEXT: [[S_B41:%.*]] = getelementptr i8, ptr [[S]], i64 4 -; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_B41]], align 8 +; TUNIT-NEXT: [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_B4]], align 8 ; TUNIT-NEXT: [[C0:%.*]] = call i32 @f(i32 [[TMP0]], i64 [[TMP1]]) #[[ATTR1:[0-9]+]] ; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[S]], align 32 -; TUNIT-NEXT: [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4 -; TUNIT-NEXT: [[TMP3:%.*]] = load i64, ptr [[S_B4]], align 32 +; TUNIT-NEXT: [[S_B41:%.*]] = getelementptr i8, ptr [[S]], i64 4 +; TUNIT-NEXT: [[TMP3:%.*]] = load i64, ptr [[S_B41]], align 32 ; TUNIT-NEXT: [[C1:%.*]] = call i32 @g(i32 [[TMP2]], i64 [[TMP3]]) #[[ATTR1]] ; TUNIT-NEXT: [[A:%.*]] = add i32 [[C0]], [[C1]] 
; TUNIT-NEXT: ret i32 [[A]] diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll index a209ee2ebe064..249119a77a4d0 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll @@ -139,12 +139,12 @@ define i32 @unions_v2() nounwind { ; TUNIT-SAME: () #[[ATTR2]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[TMP0:%.*]] = load i8, ptr @mystr, align 8 -; TUNIT-NEXT: [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4 -; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_B4]], align 8 +; TUNIT-NEXT: [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_B41]], align 8 ; TUNIT-NEXT: call void @vfu1(i8 [[TMP0]], i32 [[TMP1]]) #[[ATTR2]] ; TUNIT-NEXT: [[TMP2:%.*]] = load i8, ptr @mystr, align 8 -; TUNIT-NEXT: [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4 -; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_B41]], align 8 +; TUNIT-NEXT: [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4 +; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_B4]], align 8 ; TUNIT-NEXT: [[RESULT:%.*]] = call i32 @vfu2_v2(i8 [[TMP2]], i32 [[TMP3]]) #[[ATTR3]] ; TUNIT-NEXT: ret i32 [[RESULT]] ; diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll b/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll index 343b6b9dd433c..d1c331ce19651 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll @@ -62,13 +62,13 @@ define %0 @caller(i1 %Q) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@caller ; TUNIT-SAME: (i1 [[Q:%.*]]) #[[ATTR0]] { -; TUNIT-NEXT: [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1:[0-9]+]] +; TUNIT-NEXT: [[X:%.*]] = call [[TMP0:%.*]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1:[0-9]+]] ; TUNIT-NEXT: ret [[TMP0]] [[X]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@caller ; CGSCC-SAME: (i1 noundef [[Q:%.*]]) #[[ATTR1:[0-9]+]] { -; CGSCC-NEXT: [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2:[0-9]+]] +; CGSCC-NEXT: [[X:%.*]] = call [[TMP0:%.*]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2:[0-9]+]] ; CGSCC-NEXT: ret [[TMP0]] [[X]] ; %X = call %0 @foo(i1 %Q) @@ -87,10 +87,10 @@ define i32 @caller2(i1 %Q) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@caller2 ; TUNIT-SAME: (i1 [[Q:%.*]]) #[[ATTR0]] { -; TUNIT-NEXT: [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1]] +; TUNIT-NEXT: [[X:%.*]] = call [[TMP0:%.*]] @[[FOO]](i1 noundef [[Q]]) #[[ATTR1]] ; TUNIT-NEXT: [[A:%.*]] = extractvalue [[TMP0]] [[X]], 0 ; TUNIT-NEXT: [[B:%.*]] = extractvalue [[TMP0]] [[X]], 1 -; TUNIT-NEXT: [[Y:%.*]] = call [[TMP0]] [[BAR:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1]] +; TUNIT-NEXT: [[Y:%.*]] = call [[TMP0]] 
@[[BAR:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1]] ; TUNIT-NEXT: [[C:%.*]] = extractvalue [[TMP0]] [[Y]], 0 ; TUNIT-NEXT: [[D:%.*]] = extractvalue [[TMP0]] [[Y]], 1 ; TUNIT-NEXT: [[M:%.*]] = add i32 [[A]], [[C]] @@ -101,10 +101,10 @@ define i32 @caller2(i1 %Q) { ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@caller2 ; CGSCC-SAME: (i1 noundef [[Q:%.*]]) #[[ATTR1]] { -; CGSCC-NEXT: [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2]] +; CGSCC-NEXT: [[X:%.*]] = call [[TMP0:%.*]] @[[FOO]](i1 noundef [[Q]]) #[[ATTR2]] ; CGSCC-NEXT: [[A:%.*]] = extractvalue [[TMP0]] [[X]], 0 ; CGSCC-NEXT: [[B:%.*]] = extractvalue [[TMP0]] [[X]], 1 -; CGSCC-NEXT: [[Y:%.*]] = call [[TMP0]] [[BAR:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2]] +; CGSCC-NEXT: [[Y:%.*]] = call [[TMP0]] @[[BAR:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2]] ; CGSCC-NEXT: [[C:%.*]] = extractvalue [[TMP0]] [[Y]], 0 ; CGSCC-NEXT: [[D:%.*]] = extractvalue [[TMP0]] [[Y]], 1 ; CGSCC-NEXT: [[M:%.*]] = add i32 [[A]], [[C]] diff --git a/llvm/test/Transforms/Attributor/assumes_info.ll b/llvm/test/Transforms/Attributor/assumes_info.ll index df7d7ddd13356..088739ec5b812 100644 --- a/llvm/test/Transforms/Attributor/assumes_info.ll +++ b/llvm/test/Transforms/Attributor/assumes_info.ll @@ -119,9 +119,11 @@ attributes #3 = { "llvm.assume"="B,C,A" } ; TUNIT: attributes #[[ATTR0]] = { "llvm.assume"="A" } ; TUNIT: attributes #[[ATTR1]] = { "llvm.assume"="A,B" } ; TUNIT: attributes #[[ATTR2]] = { "llvm.assume"="A,B,C" } +; TUNIT: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="B,C,A" } ;. ; CGSCC: attributes #[[ATTR0]] = { "llvm.assume"="A" } ; CGSCC: attributes #[[ATTR1]] = { "llvm.assume"="A,B" } ; CGSCC: attributes #[[ATTR2]] = { "llvm.assume"="A,B,C" } ; CGSCC: attributes #[[ATTR3]] = { "llvm.assume"="B" } +; CGSCC: attributes #[[ATTR4:[0-9]+]] = { "llvm.assume"="B,C,A" } ;. diff --git a/llvm/test/Transforms/Attributor/cb_liveness_disabled.ll b/llvm/test/Transforms/Attributor/cb_liveness_disabled.ll index f8f374ab66141..2f64cdbca2010 100644 --- a/llvm/test/Transforms/Attributor/cb_liveness_disabled.ll +++ b/llvm/test/Transforms/Attributor/cb_liveness_disabled.ll @@ -189,6 +189,7 @@ define i32 @test_ncheck2(i32 %0) #0 { attributes #0 = { noinline nounwind sspstrong uwtable} +;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind sspstrong willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nofree nosync nounwind willreturn memory(none) } ;. diff --git a/llvm/test/Transforms/Attributor/cb_liveness_enabled.ll b/llvm/test/Transforms/Attributor/cb_liveness_enabled.ll index 32c08ee92ddef..585b6ef10c4fe 100644 --- a/llvm/test/Transforms/Attributor/cb_liveness_enabled.ll +++ b/llvm/test/Transforms/Attributor/cb_liveness_enabled.ll @@ -192,6 +192,7 @@ define i32 @test_ncheck2(i32 %0) #0 { attributes #0 = { noinline nounwind sspstrong uwtable} +;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind sspstrong willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nofree nosync nounwind willreturn memory(none) } ;. 
diff --git a/llvm/test/Transforms/Attributor/phi_bug_pointer_info.ll b/llvm/test/Transforms/Attributor/phi_bug_pointer_info.ll index bb423e10f2c72..b4a2192c25faa 100644 --- a/llvm/test/Transforms/Attributor/phi_bug_pointer_info.ll +++ b/llvm/test/Transforms/Attributor/phi_bug_pointer_info.ll @@ -17,7 +17,30 @@ ; CHECK: - c: %1 = load i32, ptr %val2, align 4 ; CHECK: - 6 - %ret = load i32, ptr %x, align 4 ; CHECK: - c: +;. +; CHECK: @globalBytes = internal global [1024 x i8] zeroinitializer +;. define dso_local i32 @phi_different_offsets(ptr nocapture %val, ptr nocapture %val2, i1 %cmp) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +; CHECK-LABEL: define dso_local i32 @phi_different_offsets +; CHECK-SAME: (ptr nocapture nofree readonly [[VAL:%.*]], ptr nocapture nofree readonly [[VAL2:%.*]], i1 noundef [[CMP:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[FIELD2:%.*]] = getelementptr i32, ptr @globalBytes, i32 2 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[FIELD2]], align 8 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: else: +; CHECK-NEXT: [[FIELD8:%.*]] = getelementptr i32, ptr @globalBytes, i32 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VAL2]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[FIELD8]], align 16 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[X:%.*]] = phi ptr [ [[FIELD2]], [[THEN]] ], [ [[FIELD8]], [[ELSE]] ] +; CHECK-NEXT: [[RET:%.*]] = load i32, ptr [[X]], align 8 +; CHECK-NEXT: ret i32 [[RET]] +; entry: br i1 %cmp, label %then, label %else @@ -39,3 +62,6 @@ end: ret i32 %ret } +;. +; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn } +;. 
diff --git a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll index 20b52c3fcd85a..9ff51e97d6e15 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll @@ -106,7 +106,7 @@ define ptr @t2(ptr %this, ptr %this.addr, ptr %this1) { ; TUNIT-SAME: (ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], ptr nocapture nofree readnone [[THIS_ADDR:%.*]], ptr nocapture nofree readnone [[THIS1:%.*]]) #[[ATTR1:[0-9]+]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @foo.1(ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4:[0-9]+]] +; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[FOO_1:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4:[0-9]+]] ; TUNIT-NEXT: [[TEST_RET:%.*]] = extractvalue [[S]] [[CALL]], 0 ; TUNIT-NEXT: ret ptr [[TEST_RET]] ; @@ -115,7 +115,7 @@ define ptr @t2(ptr %this, ptr %this.addr, ptr %this1) { ; CGSCC-SAME: (ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], ptr nocapture nofree readnone [[THIS_ADDR:%.*]], ptr nocapture nofree readnone [[THIS1:%.*]]) #[[ATTR2:[0-9]+]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @foo.1(ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8:[0-9]+]] +; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[FOO_1:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8:[0-9]+]] ; CGSCC-NEXT: [[TEST_RET:%.*]] = extractvalue [[S]] [[CALL]], 0 ; CGSCC-NEXT: ret ptr [[TEST_RET]] ; @@ -205,7 +205,7 @@ define ptr @foo(ptr %this, ptr %this.addr, ptr %this1) { ; TUNIT-SAME: (ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], ptr nocapture nofree readnone [[THIS_ADDR:%.*]], ptr nocapture nofree readnone [[THIS1:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @bar.5(ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4]] +; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[BAR_5:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4]] ; TUNIT-NEXT: [[FOO_RET:%.*]] = extractvalue [[S]] [[CALL]], 0 ; TUNIT-NEXT: ret ptr [[FOO_RET]] ; @@ -214,7 +214,7 @@ define ptr @foo(ptr %this, ptr %this.addr, ptr %this1) { ; CGSCC-SAME: (ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], ptr nocapture nofree readnone [[THIS_ADDR:%.*]], ptr nocapture nofree readnone [[THIS1:%.*]]) #[[ATTR2]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @bar.5(ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8]] +; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[BAR_5:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8]] ; CGSCC-NEXT: [[FOO_RET:%.*]] = extractvalue [[S]] [[CALL]], 0 ; CGSCC-NEXT: ret ptr [[FOO_RET]] ; @@ -317,7 +317,7 @@ define weak_odr void @t3() { ; CHECK: for.cond: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[CALL4:%.*]] = 
call [[S_2:%.*]] @t3.helper() +; CHECK-NEXT: [[CALL4:%.*]] = call [[S_2:%.*]] @[[T3_HELPER:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() ; CHECK-NEXT: ret void ; entry: @@ -380,8 +380,8 @@ define dso_local void @spam() { ; TUNIT-NEXT: store i32 [[X]], ptr [[TMP]], align 4 ; TUNIT-NEXT: br label [[BB16:%.*]] ; TUNIT: bb16: -; TUNIT-NEXT: [[TRUETMP18:%.*]] = icmp eq i32 [[X]], 0 -; TUNIT-NEXT: br i1 [[TRUETMP18]], label [[BB35:%.*]], label [[BB19:%.*]] +; TUNIT-NEXT: [[TMP18:%.*]] = icmp eq i32 [[X]], 0 +; TUNIT-NEXT: br i1 [[TMP18]], label [[BB35:%.*]], label [[BB19:%.*]] ; TUNIT: bb19: ; TUNIT-NEXT: br label [[BB23:%.*]] ; TUNIT: bb23: @@ -404,8 +404,8 @@ define dso_local void @spam() { ; CGSCC-NEXT: store i32 [[X]], ptr [[TMP]], align 4 ; CGSCC-NEXT: br label [[BB16:%.*]] ; CGSCC: bb16: -; CGSCC-NEXT: [[TRUETMP18:%.*]] = icmp eq i32 [[X]], 0 -; CGSCC-NEXT: br i1 [[TRUETMP18]], label [[BB35:%.*]], label [[BB19:%.*]] +; CGSCC-NEXT: [[TMP18:%.*]] = icmp eq i32 [[X]], 0 +; CGSCC-NEXT: br i1 [[TMP18]], label [[BB35:%.*]], label [[BB19:%.*]] ; CGSCC: bb19: ; CGSCC-NEXT: br label [[BB23:%.*]] ; CGSCC: bb23: @@ -467,7 +467,7 @@ define double @t4(ptr %this, ptr %this.addr, ptr %this1) { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[THIS_ADDR1:%.*]] = alloca ptr, i32 0, align 8 ; TUNIT-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @t4a(ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[THIS]]) #[[ATTR5]] +; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[T4A:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[THIS]]) #[[ATTR5]] ; TUNIT-NEXT: ret double 0.000000e+00 ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) @@ -476,7 +476,7 @@ define double @t4(ptr %this, ptr %this.addr, ptr %this1) { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[THIS_ADDR1:%.*]] = alloca ptr, i32 0, align 8 ; CGSCC-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @t4a(ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8]] +; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[T4A:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8]] ; CGSCC-NEXT: [[TMP0:%.*]] = extractvalue [[S]] [[CALL]], 0 ; CGSCC-NEXT: ret double 0.000000e+00 ; @@ -615,12 +615,21 @@ entry: ; CGSCC: attributes #[[ATTR8]] = { nofree nounwind willreturn } ; CGSCC: attributes #[[ATTR9]] = { nofree nounwind willreturn memory(readwrite) } ;. 
-; CHECK: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]} -; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2} -; CHECK: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +; TUNIT: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]} +; TUNIT: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; TUNIT: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; TUNIT: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; TUNIT: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; TUNIT: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; TUNIT: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2} +; TUNIT: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +;. +; CGSCC: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]} +; CGSCC: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CGSCC: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CGSCC: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CGSCC: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CGSCC: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CGSCC: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2} +; CGSCC: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ;. diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll index 06a52ce936eec..f7f92e3c87a62 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll @@ -3176,6 +3176,216 @@ define internal i32 @recSimplify2() { ret i32 %r } +; TODO: Verify we do not return 10. 
+define i32 @may_access_after_return(i32 noundef %N, i32 noundef %M) { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return +; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR18]] +; TUNIT-NEXT: ret i32 10 +; +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return +; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: call void @write_both(ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR21]] +; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; CGSCC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; CGSCC-NEXT: ret i32 [[ADD]] +; +entry: + %A = alloca i32, align 4 + %B = alloca i32, align 4 + %call = call ptr @passthrough(ptr noundef %A) + %call1 = call ptr @passthrough(ptr noundef %B) + call void @write_both(ptr noundef %call, ptr noundef %call1) + %0 = load i32, ptr %A, align 4 + %1 = load i32, ptr %B, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define internal void @write_both(ptr noundef %Q, ptr noundef %R) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) +; CHECK-LABEL: define {{[^@]+}}@write_both +; CHECK-SAME: (ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[Q:%.*]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 3, ptr [[Q]], align 4 +; CHECK-NEXT: store i32 5, ptr [[R]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 3, ptr %Q, align 4 + store i32 5, ptr %R, align 4 + ret void +} + +define internal ptr @passthrough(ptr noundef %P) { +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@passthrough +; CGSCC-SAME: (ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: ret ptr [[P]] +; +entry: + ret ptr %P +} + +; TODO: Verify we do not return 10. 
+define i32 @may_access_after_return_choice(i32 noundef %N, i32 noundef %M, i1 %c) { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_choice +; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i1 [[C:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[B]]) #[[ATTR23:[0-9]+]] +; TUNIT-NEXT: [[CALL1:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[B]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]]) #[[ATTR23]] +; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL1]]) #[[ATTR18]] +; TUNIT-NEXT: ret i32 10 +; +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn +; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_choice +; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i1 [[C:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) [[A]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) [[B]]) #[[ATTR28:[0-9]+]] +; CGSCC-NEXT: [[CALL1:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) [[B]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) [[A]]) #[[ATTR28]] +; CGSCC-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL1]]) #[[ATTR21]] +; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; CGSCC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; CGSCC-NEXT: ret i32 [[ADD]] +; +entry: + %A = alloca i32, align 4 + %B = alloca i32, align 4 + %call = call ptr @passthrough_choice(i1 %c, ptr noundef %A, ptr noundef %B) + %call1 = call ptr @passthrough_choice(i1 %c, ptr noundef %B, ptr noundef %A) + call void @write_both(ptr noundef %call, ptr noundef %call1) + %0 = load i32, ptr %A, align 4 + %1 = load i32, ptr %B, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define internal ptr @passthrough_choice(i1 %c, ptr noundef %P, ptr noundef %Q) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define {{[^@]+}}@passthrough_choice +; CHECK-SAME: (i1 [[C:%.*]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" 
[[Q:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], ptr [[P]], ptr [[Q]] +; CHECK-NEXT: ret ptr [[R]] +; +entry: + %R = select i1 %c, ptr %P, ptr %Q + ret ptr %R +} + +; TODO: Verify we do not return 10. +define i32 @may_access_after_return_no_choice1(i32 noundef %N, i32 noundef %M) { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_no_choice1 +; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR18]] +; TUNIT-NEXT: ret i32 10 +; +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_no_choice1 +; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: call void @write_both(ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR21]] +; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; CGSCC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; CGSCC-NEXT: ret i32 [[ADD]] +; +entry: + %A = alloca i32, align 4 + %B = alloca i32, align 4 + %call = call ptr @passthrough_no_choice_true(i1 true, ptr noundef %A, ptr noundef %B) + %call1 = call ptr @passthrough_no_choice_true(i1 true, ptr noundef %B, ptr noundef %A) + call void @write_both(ptr noundef %call, ptr noundef %call1) + %0 = load i32, ptr %A, align 4 + %1 = load i32, ptr %B, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +; TODO: Verify we do not return 10. 
+define i32 @may_access_after_return_no_choice2(i32 noundef %N, i32 noundef %M) { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_no_choice2 +; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]]) #[[ATTR18]] +; TUNIT-NEXT: ret i32 10 +; +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_no_choice2 +; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: call void @write_both(ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]], ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]]) #[[ATTR21]] +; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; CGSCC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; CGSCC-NEXT: ret i32 [[ADD]] +; +entry: + %A = alloca i32, align 4 + %B = alloca i32, align 4 + %call = call ptr @passthrough_no_choice_false(i1 false, ptr noundef %A, ptr noundef %B) + %call1 = call ptr @passthrough_no_choice_false(i1 false, ptr noundef %B, ptr noundef %A) + call void @write_both(ptr noundef %call, ptr noundef %call1) + %0 = load i32, ptr %A, align 4 + %1 = load i32, ptr %B, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define internal ptr @passthrough_no_choice_true(i1 %c, ptr noundef %P, ptr noundef %Q) { +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@passthrough_no_choice_true +; CGSCC-SAME: (ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]], i32 [[TMP0:%.*]]) #[[ATTR4]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[Q_PRIV:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: store i32 [[TMP0]], ptr [[Q_PRIV]], align 4 +; CGSCC-NEXT: ret ptr [[P]] +; +entry: + %R = select i1 %c, ptr %P, ptr %Q + ret ptr %R +} +define internal ptr @passthrough_no_choice_false(i1 %c, ptr noundef %P, ptr noundef %Q) { +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@passthrough_no_choice_false +; CGSCC-SAME: (i32 [[TMP0:%.*]], ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q:%.*]]) #[[ATTR4]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[P_PRIV:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: store i32 [[TMP0]], ptr [[P_PRIV]], align 4 +; CGSCC-NEXT: ret ptr [[Q]] +; +entry: + %R = select i1 %c, ptr %P, ptr %Q + ret ptr %R +} + declare void @llvm.assume(i1 noundef) @@ -3238,6 +3448,7 @@ declare void @llvm.assume(i1 noundef) ; TUNIT: attributes #[[ATTR20]] = { norecurse } ; TUNIT: attributes #[[ATTR21]] = { nounwind } ; TUNIT: attributes #[[ATTR22]] = { nofree nosync nounwind willreturn } +; TUNIT: attributes #[[ATTR23]] = { nofree nosync nounwind willreturn memory(none) } ;. 
; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) } @@ -3267,6 +3478,7 @@ declare void @llvm.assume(i1 noundef) ; CGSCC: attributes #[[ATTR25]] = { nofree nounwind } ; CGSCC: attributes #[[ATTR26]] = { nofree nounwind willreturn } ; CGSCC: attributes #[[ATTR27]] = { nofree } +; CGSCC: attributes #[[ATTR28]] = { nofree nosync willreturn } ;. ; TUNIT: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; TUNIT: [[META1:![0-9]+]] = !{i32 7, !"uwtable", i32 1} From 4ce8808dd96dd6f1380c4e27c04ff0a0a0fed12b Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 5 Sep 2024 21:12:48 +0100 Subject: [PATCH 289/425] [AMDGPU] Common up default value of -amdgpu-nsa-threshold. NFC. The default value of 3 was specified in two places. Use the actual value of the cl::init to avoid repeating it. --- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 352994e541fc8..52c24a5c25ec2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -708,7 +708,7 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { if (Value > 0) return std::max(Value, 2); - return 3; + return NSAThreshold; } GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, From e6dece9f6947a50c36f714d3fc0d86c6ad9acc9b Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Thu, 5 Sep 2024 13:33:58 -0700 Subject: [PATCH 290/425] [Attributor][FIX] Mark "may" accesses through call sites as such (#107439) Before, we kept the call site access kind (may/must) when we translated the access. However, the pointer we access it through (by passing it to the callee) might not be the underlying object. We have similar logic when we add store and load accesses. --- .../Transforms/IPO/AttributorAttributes.cpp | 20 ++++++++----- .../Attributor/value-simplify-pointer-info.ll | 28 +++++++++++++------ 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 69d29b6c04234..6b6d6d8d2a1e4 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -1397,7 +1397,8 @@ struct AAPointerInfoImpl } ChangeStatus translateAndAddState(Attributor &A, const AAPointerInfo &OtherAA, - const OffsetInfo &Offsets, CallBase &CB) { + const OffsetInfo &Offsets, CallBase &CB, + bool IsMustAcc) { using namespace AA::PointerInfo; if (!OtherAA.getState().isValidState() || !isValidState()) return indicatePessimisticFixpoint(); @@ -1410,6 +1411,8 @@ struct AAPointerInfoImpl for (const auto &It : State) { for (auto Index : It.getSecond()) { const auto &RAcc = State.getAccess(Index); + if (!IsMustAcc && RAcc.isAssumption()) + continue; for (auto Offset : Offsets) { auto NewRanges = Offset == AA::RangeTy::Unknown ? 
AA::RangeTy::getUnknown() @@ -1417,9 +1420,11 @@ struct AAPointerInfoImpl if (!NewRanges.isUnknown()) { NewRanges.addToAllOffsets(Offset); } - Changed |= - addAccess(A, NewRanges, CB, RAcc.getContent(), RAcc.getKind(), - RAcc.getType(), RAcc.getRemoteInst()); + AccessKind AK = RAcc.getKind(); + if (!IsMustAcc) + AK = AccessKind((AK & ~AK_MUST) | AK_MAY); + Changed |= addAccess(A, NewRanges, CB, RAcc.getContent(), AK, + RAcc.getType(), RAcc.getRemoteInst()); } } } @@ -1893,9 +1898,10 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) { DepClassTy::REQUIRED); if (!CSArgPI) return false; - Changed = - translateAndAddState(A, *CSArgPI, OffsetInfoMap[CurPtr], *CB) | - Changed; + bool IsMustAcc = (getUnderlyingObject(CurPtr) == &AssociatedValue); + Changed = translateAndAddState(A, *CSArgPI, OffsetInfoMap[CurPtr], *CB, + IsMustAcc) | + Changed; return isValidState(); } LLVM_DEBUG(dbgs() << "[AAPointerInfo] Call user not handled " << *CB diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll index f7f92e3c87a62..69bff7b5e783e 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll @@ -3176,7 +3176,7 @@ define internal i32 @recSimplify2() { ret i32 %r } -; TODO: Verify we do not return 10. +; Verify we do not return 10. define i32 @may_access_after_return(i32 noundef %N, i32 noundef %M) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return @@ -3185,7 +3185,10 @@ define i32 @may_access_after_return(i32 noundef %N, i32 noundef %M) { ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR18]] -; TUNIT-NEXT: ret i32 10 +; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; TUNIT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; TUNIT-NEXT: ret i32 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return @@ -3237,7 +3240,7 @@ entry: ret ptr %P } -; TODO: Verify we do not return 10. +; Verify we do not return 10. 
define i32 @may_access_after_return_choice(i32 noundef %N, i32 noundef %M, i1 %c) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_choice @@ -3248,7 +3251,10 @@ define i32 @may_access_after_return_choice(i32 noundef %N, i32 noundef %M, i1 %c ; TUNIT-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[B]]) #[[ATTR23:[0-9]+]] ; TUNIT-NEXT: [[CALL1:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[B]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]]) #[[ATTR23]] ; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL1]]) #[[ATTR18]] -; TUNIT-NEXT: ret i32 10 +; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; TUNIT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; TUNIT-NEXT: ret i32 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_choice @@ -3289,7 +3295,7 @@ entry: ret ptr %R } -; TODO: Verify we do not return 10. +; Verify we do not return 10. define i32 @may_access_after_return_no_choice1(i32 noundef %N, i32 noundef %M) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_no_choice1 @@ -3298,7 +3304,10 @@ define i32 @may_access_after_return_no_choice1(i32 noundef %N, i32 noundef %M) { ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR18]] -; TUNIT-NEXT: ret i32 10 +; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; TUNIT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; TUNIT-NEXT: ret i32 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_no_choice1 @@ -3324,7 +3333,7 @@ entry: ret i32 %add } -; TODO: Verify we do not return 10. +; Verify we do not return 10. 
define i32 @may_access_after_return_no_choice2(i32 noundef %N, i32 noundef %M) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_no_choice2 @@ -3333,7 +3342,10 @@ define i32 @may_access_after_return_no_choice2(i32 noundef %N, i32 noundef %M) { ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]]) #[[ATTR18]] -; TUNIT-NEXT: ret i32 10 +; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; TUNIT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; TUNIT-NEXT: ret i32 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_no_choice2 From 08533a3ee8f3a09a59cf6ac3be59198b26b7f739 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Thu, 5 Sep 2024 13:36:26 -0700 Subject: [PATCH 291/425] [Offload][NFC] Reorganize `utils::` and make Device/Host/Shared clearer (#100280) We had three `utils::` namespaces, all with different "meaning" (host, device, hsa_utils). We should, when we can, keep "include/Shared" accessible from host and device, thus RefCountTy has been moved to a separate header. `hsa_utils` was introduced to make `utils::` less overloaded. And common functionality was de-duplicated, e.g., `utils::advance` and `utils::advanceVoidPtr` -> `utils:advancePtr`. Type punning now checks for the size of the result to make sure it matches the source type. No functional change was intended. 
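
A minimal sketch of the consolidated helpers this message refers to (illustrative only; the exact signatures live in `offload/include/Shared/Utils.h` and are not reproduced here, and the names below are taken from the description above, i.e. `advancePtr` replacing `advance`/`advanceVoidPtr`, plus the new size check on type punning):

```cpp
namespace utils {

/// Advance \p Ptr by \p Bytes bytes, keeping the pointee type.
/// (Sketch of the shared advancePtr helper; replaces the old
/// utils::advance / utils::advanceVoidPtr pair.)
template <typename Ty1, typename Ty2> Ty1 *advancePtr(Ty1 *Ptr, Ty2 Bytes) {
  return reinterpret_cast<Ty1 *>(reinterpret_cast<char *>(Ptr) + Bytes);
}

/// Return \p V type-punned as \p DstTy. The sizes must now match;
/// the static_assert is the "checks for the size of the result" part.
template <typename DstTy, typename SrcTy> DstTy convertViaPun(SrcTy V) {
  static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion");
  return *((DstTy *)(&V));
}

} // namespace utils
```

With the old `convertViaPun` shown in the deleted DeviceRTL `Utils.h`, punning between types of different sizes compiled and simply read or truncated bytes; with a size check of this kind, such a mismatch fails at compile time instead.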
--- offload/DeviceRTL/CMakeLists.txt | 6 +- offload/DeviceRTL/include/Allocator.h | 2 +- offload/DeviceRTL/include/Configuration.h | 2 +- .../include/{Types.h => DeviceTypes.h} | 6 +- offload/DeviceRTL/include/DeviceUtils.h | 54 ++++++++++ offload/DeviceRTL/include/Interface.h | 2 +- offload/DeviceRTL/include/LibC.h | 2 +- offload/DeviceRTL/include/Mapping.h | 2 +- offload/DeviceRTL/include/State.h | 4 +- offload/DeviceRTL/include/Synchronization.h | 2 +- offload/DeviceRTL/include/Utils.h | 100 ------------------ offload/DeviceRTL/src/Allocator.cpp | 4 +- offload/DeviceRTL/src/Configuration.cpp | 2 +- offload/DeviceRTL/src/Debug.cpp | 2 +- .../src/{Utils.cpp => DeviceUtils.cpp} | 16 +-- offload/DeviceRTL/src/Kernel.cpp | 2 +- offload/DeviceRTL/src/Mapping.cpp | 4 +- offload/DeviceRTL/src/Misc.cpp | 2 +- offload/DeviceRTL/src/Parallelism.cpp | 4 +- offload/DeviceRTL/src/Reduction.cpp | 4 +- offload/DeviceRTL/src/State.cpp | 16 +-- offload/DeviceRTL/src/Synchronization.cpp | 4 +- offload/DeviceRTL/src/Tasking.cpp | 6 +- offload/DeviceRTL/src/Workshare.cpp | 6 +- offload/include/Shared/RefCnt.h | 56 ++++++++++ offload/include/Shared/Types.h | 22 ++++ offload/include/Shared/Utils.h | 100 ++++++++---------- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 88 +++++++-------- .../amdgpu/utils/UtilitiesRTL.h | 5 +- .../common/include/PluginInterface.h | 18 ++-- .../common/src/GlobalHandler.cpp | 4 +- offload/plugins-nextgen/common/src/JIT.cpp | 4 +- .../common/src/PluginInterface.cpp | 26 ++--- offload/plugins-nextgen/cuda/src/rtl.cpp | 2 +- offload/src/DeviceImage.cpp | 5 +- offload/src/omptarget.cpp | 4 +- 36 files changed, 308 insertions(+), 280 deletions(-) rename offload/DeviceRTL/include/{Types.h => DeviceTypes.h} (97%) create mode 100644 offload/DeviceRTL/include/DeviceUtils.h delete mode 100644 offload/DeviceRTL/include/Utils.h rename offload/DeviceRTL/src/{Utils.cpp => DeviceUtils.cpp} (90%) create mode 100644 offload/include/Shared/RefCnt.h create mode 100644 offload/include/Shared/Types.h diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index ad465f0ccbd61..6b86c4d1ce0cf 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -80,8 +80,8 @@ set(include_files ${include_directory}/Profiling.h ${include_directory}/State.h ${include_directory}/Synchronization.h - ${include_directory}/Types.h - ${include_directory}/Utils.h + ${include_directory}/DeviceTypes.h + ${include_directory}/DeviceUtils.h ${include_directory}/Workshare.h ) @@ -99,7 +99,7 @@ set(src_files ${source_directory}/State.cpp ${source_directory}/Synchronization.cpp ${source_directory}/Tasking.cpp - ${source_directory}/Utils.cpp + ${source_directory}/DeviceUtils.cpp ${source_directory}/Workshare.cpp ) diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h index 23e0106c80a2c..475f6a21bb47e 100644 --- a/offload/DeviceRTL/include/Allocator.h +++ b/offload/DeviceRTL/include/Allocator.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_ALLOCATOR_H #define OMPTARGET_ALLOCATOR_H -#include "Types.h" +#include "DeviceTypes.h" // Forward declaration. 
struct KernelEnvironmentTy; diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h index 8e6f5c89cbf24..f8b7a6c3c6c9d 100644 --- a/offload/DeviceRTL/include/Configuration.h +++ b/offload/DeviceRTL/include/Configuration.h @@ -15,7 +15,7 @@ #include "Shared/Environment.h" -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { namespace config { diff --git a/offload/DeviceRTL/include/Types.h b/offload/DeviceRTL/include/DeviceTypes.h similarity index 97% rename from offload/DeviceRTL/include/Types.h rename to offload/DeviceRTL/include/DeviceTypes.h index cd8f925a392a8..c7132be345e0e 100644 --- a/offload/DeviceRTL/include/Types.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -1,4 +1,4 @@ -//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===// +//===---------- DeviceTypes.h - OpenMP types ---------------------- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -115,9 +115,9 @@ enum kmp_sched_t { #define SCHEDULE_WITHOUT_MODIFIERS(s) \ (enum kmp_sched_t)( \ (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) -#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) +#define SCHEDULE_HAS_MONOTONIC(s) (((s) & kmp_sched_modifier_monotonic) != 0) #define SCHEDULE_HAS_NONMONOTONIC(s) \ - (((s)&kmp_sched_modifier_nonmonotonic) != 0) + (((s) & kmp_sched_modifier_nonmonotonic) != 0) #define SCHEDULE_HAS_NO_MODIFIERS(s) \ (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ 0) diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h new file mode 100644 index 0000000000000..549ca16e1c34c --- /dev/null +++ b/offload/DeviceRTL/include/DeviceUtils.h @@ -0,0 +1,54 @@ +//===--- DeviceUtils.h - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H +#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H + +#include "DeviceTypes.h" +#include "Shared/Utils.h" + +#pragma omp begin declare target device_type(nohost) + +namespace utils { + +/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread +/// is identified by \p Mask. +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); + +int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); + +uint64_t ballotSync(uint64_t Mask, int32_t Pred); + +/// Return \p LowBits and \p HighBits packed into a single 64 bit value. +uint64_t pack(uint32_t LowBits, uint32_t HighBits); + +/// Unpack \p Val into \p LowBits and \p HighBits. +void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); + +/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). +bool isSharedMemPtr(void *Ptr); + +/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)). +bool isThreadLocalMemPtr(void *Ptr); + +/// A pointer variable that has by design an `undef` value. Use with care. 
+[[clang::loader_uninitialized]] static void *const UndefPtr; + +#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) +#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) + +} // namespace utils + +#pragma omp end declare target + +#endif diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h index d36d4227091ef..c4bfaaa2404b4 100644 --- a/offload/DeviceRTL/include/Interface.h +++ b/offload/DeviceRTL/include/Interface.h @@ -14,7 +14,7 @@ #include "Shared/Environment.h" -#include "Types.h" +#include "DeviceTypes.h" /// External API /// diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h index 59a795cc62e0e..03febdb508342 100644 --- a/offload/DeviceRTL/include/LibC.h +++ b/offload/DeviceRTL/include/LibC.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_LIBC_H #define OMPTARGET_LIBC_H -#include "Types.h" +#include "DeviceTypes.h" extern "C" { diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h index 165904644dbb9..2fb87abe5418c 100644 --- a/offload/DeviceRTL/include/Mapping.h +++ b/offload/DeviceRTL/include/Mapping.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_MAPPING_H #define OMPTARGET_MAPPING_H -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h index 1a3490394458f..565235cd48a91 100644 --- a/offload/DeviceRTL/include/State.h +++ b/offload/DeviceRTL/include/State.h @@ -15,9 +15,9 @@ #include "Shared/Environment.h" #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Mapping.h" -#include "Types.h" -#include "Utils.h" // Forward declaration. struct KernelEnvironmentTy; diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h index af9e1a673e6a2..874974cc861df 100644 --- a/offload/DeviceRTL/include/Synchronization.h +++ b/offload/DeviceRTL/include/Synchronization.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H #define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h deleted file mode 100644 index 82e2397b5958b..0000000000000 --- a/offload/DeviceRTL/include/Utils.h +++ /dev/null @@ -1,100 +0,0 @@ -//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_UTILS_H -#define OMPTARGET_DEVICERTL_UTILS_H - -#include "Types.h" - -#pragma omp begin declare target device_type(nohost) - -namespace ompx { -namespace utils { - -/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread -/// is identified by \p Mask. -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); - -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); - -int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); - -uint64_t ballotSync(uint64_t Mask, int32_t Pred); - -/// Return \p LowBits and \p HighBits packed into a single 64 bit value. 
-uint64_t pack(uint32_t LowBits, uint32_t HighBits); - -/// Unpack \p Val into \p LowBits and \p HighBits. -void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); - -/// Round up \p V to a \p Boundary. -template inline Ty roundUp(Ty V, Ty Boundary) { - return (V + Boundary - 1) / Boundary * Boundary; -} - -/// Advance \p Ptr by \p Bytes bytes. -template inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) { - return reinterpret_cast(reinterpret_cast(Ptr) + Bytes); -} - -/// Return the first bit set in \p V. -inline uint32_t ffs(uint32_t V) { - static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); - return __builtin_ffs(V); -} - -/// Return the first bit set in \p V. -inline uint32_t ffs(uint64_t V) { - static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); - return __builtin_ffsl(V); -} - -/// Return the number of bits set in \p V. -inline uint32_t popc(uint32_t V) { - static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); - return __builtin_popcount(V); -} - -/// Return the number of bits set in \p V. -inline uint32_t popc(uint64_t V) { - static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); - return __builtin_popcountl(V); -} - -/// Return \p V aligned "upwards" according to \p Align. -template inline Ty1 align_up(Ty1 V, Ty2 Align) { - return ((V + Ty1(Align) - 1) / Ty1(Align)) * Ty1(Align); -} -/// Return \p V aligned "downwards" according to \p Align. -template inline Ty1 align_down(Ty1 V, Ty2 Align) { - return V - V % Align; -} - -/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). -bool isSharedMemPtr(void *Ptr); - -/// Return \p V typed punned as \p DstTy. -template inline DstTy convertViaPun(SrcTy V) { - return *((DstTy *)(&V)); -} - -/// A pointer variable that has by design an `undef` value. Use with care. 
-[[clang::loader_uninitialized]] static void *const UndefPtr; - -#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) -#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) - -} // namespace utils -} // namespace ompx - -#pragma omp end declare target - -#endif diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp index c9c940de62c1a..ac662c48d4f5f 100644 --- a/offload/DeviceRTL/src/Allocator.cpp +++ b/offload/DeviceRTL/src/Allocator.cpp @@ -12,10 +12,10 @@ #include "Allocator.h" #include "Configuration.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Mapping.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp index ef0c3663536f5..9e14c203d4a04 100644 --- a/offload/DeviceRTL/src/Configuration.cpp +++ b/offload/DeviceRTL/src/Configuration.cpp @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "Configuration.h" +#include "DeviceTypes.h" #include "State.h" -#include "Types.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp index 5a2c84c7ee38a..b451f17c6bbd8 100644 --- a/offload/DeviceRTL/src/Debug.cpp +++ b/offload/DeviceRTL/src/Debug.cpp @@ -14,10 +14,10 @@ #include "Configuration.h" #include "Debug.h" +#include "DeviceTypes.h" #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Types.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp similarity index 90% rename from offload/DeviceRTL/src/Utils.cpp rename to offload/DeviceRTL/src/DeviceUtils.cpp index 53cc803234867..c204a7be73b1f 100644 --- a/offload/DeviceRTL/src/Utils.cpp +++ b/offload/DeviceRTL/src/DeviceUtils.cpp @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -#include "Utils.h" +#include "DeviceUtils.h" #include "Debug.h" #include "Interface.h" @@ -33,7 +33,7 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; } -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, int32_t Width); @@ -44,8 +44,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred); ///{ #pragma omp begin declare variant match(device = {arch(amdgcn)}) -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { - int Width = mapping::getWarpSize(); +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { int Self = mapping::getThreadIdInWarp(); int Index = SrcLane + (Self & ~(Width - 1)); return __builtin_amdgcn_ds_bpermute(Index << 2, Var); @@ -77,8 +76,8 @@ bool isSharedMemPtr(const void *Ptr) { device = {arch(nvptx, nvptx64)}, \ implementation = {extension(match_any)}) -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { - return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1); } int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { @@ -104,8 +103,9 @@ void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { impl::Unpack(Val, &LowBits, &HighBits); } -int32_t utils::shuffle(uint64_t Mask, 
int32_t Var, int32_t SrcLane) { - return impl::shuffle(Mask, Var, SrcLane); +int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, + int32_t Width) { + return impl::shuffle(Mask, Var, SrcLane, Width); } int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index e70704f25e922..8bb275eae776c 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -14,11 +14,11 @@ #include "Allocator.h" #include "Debug.h" +#include "DeviceTypes.h" #include "Interface.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" #include "Workshare.h" #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index c1ce878746a69..3aefcff68e195 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ b/offload/DeviceRTL/src/Mapping.cpp @@ -10,10 +10,10 @@ //===----------------------------------------------------------------------===// #include "Mapping.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "State.h" -#include "Types.h" -#include "Utils.h" #pragma omp begin declare target device_type(nohost) diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp index ce4a221bdb37d..8e690f6fd8e7c 100644 --- a/offload/DeviceRTL/src/Misc.cpp +++ b/offload/DeviceRTL/src/Misc.cpp @@ -11,7 +11,7 @@ #include "Allocator.h" #include "Configuration.h" -#include "Types.h" +#include "DeviceTypes.h" #include "Debug.h" diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index 15b991f202539..5286d53b623f0 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -33,12 +33,12 @@ //===----------------------------------------------------------------------===// #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp index 744d1a3a231c8..57df159d3f28e 100644 --- a/offload/DeviceRTL/src/Reduction.cpp +++ b/offload/DeviceRTL/src/Reduction.cpp @@ -11,12 +11,12 @@ //===----------------------------------------------------------------------===// #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index f43f2cedb431d..855c74fa58e0a 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -13,13 +13,13 @@ #include "Allocator.h" #include "Configuration.h" #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "LibC.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; @@ -84,14 +84,14 @@ struct SharedMemorySmartStackTy { /// Deallocate the last allocation made by the encountering thread and pointed /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint32_t Bytes); + void pop(void *Ptr, uint64_t Bytes); private: /// Compute the size of the storage space reserved for a thread. 
uint32_t computeThreadStorageTotal() { uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); - return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock), - allocator::ALIGNMENT); + return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock), + allocator::ALIGNMENT); } /// Return the top address of the warp data stack, that is the first address @@ -121,7 +121,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { // First align the number of requested bytes. /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to /// be passed in as an argument and the stack rewritten to support it. - uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); + uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); uint32_t StorageTotal = computeThreadStorageTotal(); @@ -148,8 +148,8 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { return GlobalMemory; } -void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { - uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); +void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { + uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); if (utils::isSharedMemPtr(Ptr)) { int TId = mapping::getThreadIdInBlock(); Usage[TId] -= AlignedBytes; diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp index 80ba87b300bcd..d6452a5d589c5 100644 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ b/offload/DeviceRTL/src/Synchronization.cpp @@ -13,11 +13,11 @@ #include "Synchronization.h" #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Types.h" -#include "Utils.h" #pragma omp begin declare target device_type(nohost) diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp index 2dc33562e6d79..23a967c1a337e 100644 --- a/offload/DeviceRTL/src/Tasking.cpp +++ b/offload/DeviceRTL/src/Tasking.cpp @@ -13,10 +13,10 @@ // //===----------------------------------------------------------------------===// +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "State.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; @@ -34,7 +34,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( TaskSizeTotal, "explicit task descriptor"); TaskDescriptor->Payload = - utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded); + utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded); TaskDescriptor->TaskFn = TaskFn; return TaskDescriptor; diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index 7e087a07e4420..ad60e66548be9 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -14,12 +14,12 @@ #include "Workshare.h" #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; @@ -349,7 +349,7 @@ template struct omptarget_nvptx_LoopSupport { if (rank == 0) { warp_res = atomic::add(&Cnt, change, atomic::seq_cst); } - warp_res = utils::shuffle(active, warp_res, leader); + warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize()); return warp_res + rank; } diff --git 
a/offload/include/Shared/RefCnt.h b/offload/include/Shared/RefCnt.h new file mode 100644 index 0000000000000..7c615ba167a3d --- /dev/null +++ b/offload/include/Shared/RefCnt.h @@ -0,0 +1,56 @@ +//===-- Shared/RefCnt.h - Helper to keep track of references --- C++ ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_REF_CNT_H +#define OMPTARGET_SHARED_REF_CNT_H + +#include +#include +#include +#include + +namespace llvm { +namespace omp { +namespace target { + +/// Utility class for thread-safe reference counting. Any class that needs +/// objects' reference counting can inherit from this entity or have it as a +/// class data member. +template +struct RefCountTy { + /// Create a refcount object initialized to zero. + RefCountTy() : Refs(0) {} + + ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); } + + /// Increase the reference count atomically. + void increase() { Refs.fetch_add(1, MemoryOrder); } + + /// Decrease the reference count and return whether it became zero. Decreasing + /// the counter in more units than it was previously increased results in + /// undefined behavior. + bool decrease() { + Ty Prev = Refs.fetch_sub(1, MemoryOrder); + assert(Prev > 0 && "Invalid refcount"); + return (Prev == 1); + } + + Ty get() const { return Refs.load(MemoryOrder); } + +private: + /// The atomic reference counter. + std::atomic Refs; +}; +} // namespace target +} // namespace omp +} // namespace llvm + +#endif diff --git a/offload/include/Shared/Types.h b/offload/include/Shared/Types.h new file mode 100644 index 0000000000000..15e3cfefa37ed --- /dev/null +++ b/offload/include/Shared/Types.h @@ -0,0 +1,22 @@ +//===-- Shared/Types.h - Type defs shared between host and device - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Environments shared between host and device. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_TYPES_H +#define OMPTARGET_SHARED_TYPES_H + +#ifndef OMPTARGET_DEVICE_RUNTIME +#include +#else +#include "DeviceTypes.h" +#endif + +#endif // OMPTARGET_SHARED_TYPES_H diff --git a/offload/include/Shared/Utils.h b/offload/include/Shared/Utils.h index fce14b54edb98..da83551fffd54 100644 --- a/offload/include/Shared/Utils.h +++ b/offload/include/Shared/Utils.h @@ -6,83 +6,73 @@ // //===----------------------------------------------------------------------===// // -// Routines and classes used to provide useful functionalities like string -// parsing and environment variables. +// Routines and classes used to provide useful functionalities for the host and +// the device. 
// //===----------------------------------------------------------------------===// #ifndef OMPTARGET_SHARED_UTILS_H #define OMPTARGET_SHARED_UTILS_H -#include "llvm/ADT/StringRef.h" +#include "Types.h" -#include "Debug.h" - -#include -#include -#include -#include - -namespace llvm { -namespace omp { -namespace target { - -/// Utility class for thread-safe reference counting. Any class that needs -/// objects' reference counting can inherit from this entity or have it as a -/// class data member. -template -struct RefCountTy { - /// Create a refcount object initialized to zero. - RefCountTy() : Refs(0) {} - - ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); } - - /// Increase the reference count atomically. - void increase() { Refs.fetch_add(1, MemoryOrder); } - - /// Decrease the reference count and return whether it became zero. Decreasing - /// the counter in more units than it was previously increased results in - /// undefined behavior. - bool decrease() { - Ty Prev = Refs.fetch_sub(1, MemoryOrder); - assert(Prev > 0 && "Invalid refcount"); - return (Prev == 1); - } - - Ty get() const { return Refs.load(MemoryOrder); } - -private: - /// The atomic reference counter. - std::atomic Refs; -}; +namespace utils { /// Return the difference (in bytes) between \p Begin and \p End. template -ptrdiff_t getPtrDiff(const void *End, const void *Begin) { +auto getPtrDiff(const void *End, const void *Begin) { return reinterpret_cast(End) - reinterpret_cast(Begin); } /// Return \p Ptr advanced by \p Offset bytes. -template Ty *advanceVoidPtr(Ty *Ptr, int64_t Offset) { - static_assert(std::is_void::value); - return const_cast(reinterpret_cast(Ptr) + Offset); +template Ty1 *advancePtr(Ty1 *Ptr, Ty2 Offset) { + return (Ty1 *)(const_cast((const char *)(Ptr)) + Offset); } -/// Return \p Ptr aligned to \p Alignment bytes. -template Ty *alignPtr(Ty *Ptr, int64_t Alignment) { - size_t Space = std::numeric_limits::max(); - return std::align(Alignment, sizeof(char), Ptr, Space); +/// Return \p V aligned "upwards" according to \p Align. +template inline Ty1 alignPtr(Ty1 V, Ty2 Align) { + return reinterpret_cast(((uintptr_t(V) + Align - 1) / Align) * Align); +} +/// Return \p V aligned "downwards" according to \p Align. +template inline Ty1 alignDown(Ty1 V, Ty2 Align) { + return V - V % Align; } /// Round up \p V to a \p Boundary. template inline Ty roundUp(Ty V, Ty Boundary) { - return (V + Boundary - 1) / Boundary * Boundary; + return alignPtr(V, Boundary); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint32_t V) { + static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); + return __builtin_ffs(V); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_ffsl(V); +} + +/// Return the number of bits set in \p V. +inline uint32_t popc(uint32_t V) { + static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); + return __builtin_popcount(V); +} + +/// Return the number of bits set in \p V. 
+inline uint32_t popc(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_popcountl(V); +} + +template inline DstTy convertViaPun(SrcTy V) { + static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion"); + return *((DstTy *)(&V)); } -} // namespace target -} // namespace omp -} // namespace llvm +} // namespace utils #endif // OMPTARGET_SHARED_UTILS_H diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 86df4584db091..f0cc0c2e4d08e 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -26,6 +26,7 @@ #include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" +#include "Shared/RefCnt.h" #include "Shared/Utils.h" #include "Utils/ELF.h" @@ -91,7 +92,7 @@ struct AMDGPUDeviceImageTy; struct AMDGPUMemoryManagerTy; struct AMDGPUMemoryPoolTy; -namespace utils { +namespace hsa_utils { /// Iterate elements using an HSA iterate function. Do not use this function /// directly but the specialized ones below instead. @@ -191,7 +192,7 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, Expected getTargetTripleAndFeatures(hsa_agent_t Agent) { std::string Target; - auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { + auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { uint32_t Length; hsa_status_t Status; Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); @@ -212,7 +213,7 @@ Expected getTargetTripleAndFeatures(hsa_agent_t Agent) { return Err; return Target; } -} // namespace utils +} // namespace hsa_utils /// Utility class representing generic resource references to AMDGPU resources. template @@ -549,7 +550,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { // TODO: Read the kernel descriptor for the max threads per block. May be // read from the image. - ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); + ImplicitArgsSize = + hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion()); // Get additional kernel info read from image @@ -1270,13 +1272,14 @@ struct AMDGPUStreamTy { // Issue the async memory copy. if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, - CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, - CopySize, 0, nullptr, OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, + Agent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Push an asynchronous memory copy device-to-host involving an unpinned @@ -1310,14 +1313,14 @@ struct AMDGPUStreamTy { // dependency if already satisfied. 
if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - if (auto Err = utils::asyncMemCopy( + if (auto Err = hsa_utils::asyncMemCopy( UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, &InputSignalRaw, OutputSignals[0]->get())) return Err; } else { - if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent, - Src, Agent, CopySize, 0, nullptr, - OutputSignals[0]->get())) + if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, + Agent, Src, Agent, CopySize, 0, + nullptr, OutputSignals[0]->get())) return Err; } @@ -1408,12 +1411,13 @@ struct AMDGPUStreamTy { // dependency if already satisfied. if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent, - CopySize, 0, nullptr, OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 0, nullptr, + OutputSignal->get()); } // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead @@ -1437,13 +1441,13 @@ struct AMDGPUStreamTy { if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, - SrcAgent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, - SrcAgent, CopySize, 0, nullptr, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Synchronize with the stream. The current thread waits until all operations @@ -1806,7 +1810,7 @@ struct AMDHostDeviceTy : public AMDGenericDeviceTy { Error retrieveAllMemoryPools() override { // Iterate through the available pools across the host agents. 
for (hsa_agent_t Agent : Agents) { - Error Err = utils::iterateAgentMemoryPools( + Error Err = hsa_utils::iterateAgentMemoryPools( Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { AMDGPUMemoryPoolTy *MemoryPool = new AMDGPUMemoryPoolTy(HSAMemoryPool); @@ -1971,7 +1975,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // Detect if XNACK is enabled auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(Agent); + hsa_utils::getTargetTripleAndFeatures(Agent); if (!TargeTripleAndFeaturesOrError) return TargeTripleAndFeaturesOrError.takeError(); if (static_cast(*TargeTripleAndFeaturesOrError) @@ -2323,9 +2327,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, - Agent, PinnedPtr, Agent, Size, 0, - nullptr, Signal.get())) + if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, + Agent, PinnedPtr, Agent, Size, 0, + nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2383,9 +2387,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr, - Agent, TgtPtr, Agent, Size, 0, nullptr, - Signal.get())) + if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), + PinnedPtr, Agent, TgtPtr, Agent, + Size, 0, nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2427,7 +2431,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy( + if (auto Err = hsa_utils::asyncMemCopy( useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) return Err; @@ -2693,7 +2697,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { } Info.add("ISAs"); - auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { + auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) Info.add("Name", TmpChar); @@ -2775,7 +2779,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Retrieve and construct all memory pools of the device agent. Error retrieveAllMemoryPools() override { // Iterate through the available pools of the device agent. - return utils::iterateAgentMemoryPools( + return hsa_utils::iterateAgentMemoryPools( Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { AMDGPUMemoryPoolTy *MemoryPool = Plugin.allocate(); @@ -2961,7 +2965,7 @@ Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { if (Result) return Plugin::error("Loaded HSA executable does not validate"); - if (auto Err = utils::readAMDGPUMetaDataFromImage( + if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage( getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) return Err; @@ -3090,7 +3094,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy { llvm::SmallVector HostAgents; // Count the number of available agents. - auto Err = utils::iterateAgents([&](hsa_agent_t Agent) { + auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) { // Get the device type of the agent. 
hsa_device_type_t DeviceType; hsa_status_t Status = @@ -3185,7 +3189,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy { return false; auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); + hsa_utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); if (!TargeTripleAndFeaturesOrError) return TargeTripleAndFeaturesOrError.takeError(); return offloading::amdgpu::isImageCompatibleWithEnv( @@ -3333,11 +3337,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) return Err; - utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; + hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) { // Initialize implicit arguments. - ImplArgs = reinterpret_cast( - advanceVoidPtr(AllArgs, LaunchParams.Size)); + ImplArgs = reinterpret_cast( + utils::advancePtr(AllArgs, LaunchParams.Size)); // Initialize the implicit arguments to zero. std::memset(ImplArgs, 0, getImplicitArgsSize()); @@ -3361,7 +3365,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used. if (ImplArgs && - getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) { + getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) { ImplArgs->BlockCountX = NumBlocks; ImplArgs->BlockCountY = 1; ImplArgs->BlockCountZ = 1; diff --git a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index 0b6bc50ebf1d8..43be4e8edeba4 100644 --- a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -23,7 +23,8 @@ namespace llvm { namespace omp { namespace target { namespace plugin { -namespace utils { +namespace hsa_utils { + // The implicit arguments of COV5 AMDGPU kernels. struct AMDGPUImplicitArgsTy { uint32_t BlockCountX; @@ -66,7 +67,7 @@ inline Error readAMDGPUMetaDataFromImage( return Err; } -} // namespace utils +} // namespace hsa_utils } // namespace plugin } // namespace target } // namespace omp diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 7e3e788fa52dc..41cc0f286a581 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -232,7 +232,7 @@ class DeviceImageTy { /// Get the image size. size_t getSize() const { - return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); + return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); } /// Get a memory buffer reference to the whole image. @@ -539,7 +539,7 @@ class PinnedAllocationMapTy { --It; // The buffer is not contained in the pinned allocation. - if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr) + if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr) return &(*It); // None found. @@ -566,15 +566,15 @@ class PinnedAllocationMapTy { /// Indicate whether the first range A fully contains the second range B. static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { - void *EndA = advanceVoidPtr(PtrA, SizeA); - void *EndB = advanceVoidPtr(PtrB, SizeB); + void *EndA = utils::advancePtr(PtrA, SizeA); + void *EndB = utils::advancePtr(PtrB, SizeB); return (PtrB >= PtrA && EndB <= EndA); } /// Indicate whether the first range A intersects with the second range B. 
static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { - void *EndA = advanceVoidPtr(PtrA, SizeA); - void *EndB = advanceVoidPtr(PtrB, SizeB); + void *EndA = utils::advancePtr(PtrA, SizeA); + void *EndB = utils::advancePtr(PtrB, SizeB); return (PtrA < EndB && PtrB < EndA); } @@ -656,8 +656,8 @@ class PinnedAllocationMapTy { if (!Entry) return nullptr; - return advanceVoidPtr(Entry->DevAccessiblePtr, - getPtrDiff(HstPtr, Entry->HstPtr)); + return utils::advancePtr(Entry->DevAccessiblePtr, + utils::getPtrDiff(HstPtr, Entry->HstPtr)); } /// Check whether a buffer belongs to a registered host pinned allocation. @@ -944,7 +944,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy { auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor(); for (auto &It : *AllocationTraceMap) { if (It.first <= DevicePtr && - advanceVoidPtr(It.first, It.second->Size) > DevicePtr) + utils::advancePtr(It.first, It.second->Size) > DevicePtr) return It.second; } return nullptr; diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index 59719027f122a..5acc94458bbd8 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -153,8 +153,8 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, HostGlobal.getPtr()); assert(Image.getStart() <= ImageGlobal.getPtr() && - advanceVoidPtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) < - advanceVoidPtr(Image.getStart(), Image.getSize()) && + utils::advancePtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) < + utils::advancePtr(Image.getStart(), Image.getSize()) && "Attempting to read outside the image!"); // Perform the copy from the image to the host memory. 
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp index 9dbba1459839d..9adb62b677b92 100644 --- a/offload/plugins-nextgen/common/src/JIT.cpp +++ b/offload/plugins-nextgen/common/src/JIT.cpp @@ -51,7 +51,7 @@ namespace { bool isImageBitcode(const __tgt_device_image &Image) { StringRef Binary(reinterpret_cast(Image.ImageStart), - target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); return identify_magic(Binary) == file_magic::bitcode; } @@ -69,7 +69,7 @@ createModuleFromMemoryBuffer(std::unique_ptr &MB, Expected> createModuleFromImage(const __tgt_device_image &Image, LLVMContext &Context) { StringRef Data((const char *)Image.ImageStart, - target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); std::unique_ptr MB = MemoryBuffer::getMemBuffer( Data, /*BufferName=*/"", /*RequiresNullTerminator=*/false); return createModuleFromMemoryBuffer(MB, Context); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 60f7c918d7adb..25b815b7f9669 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -17,6 +17,7 @@ #include "ErrorReporting.h" #include "GlobalHandler.h" #include "JIT.h" +#include "Shared/Utils.h" #include "Utils/ELF.h" #include "omptarget.h" @@ -77,7 +78,7 @@ struct RecordReplayTy { Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); Device->free(Addr); // Align Address to MaxMemoryAllocation - Addr = (void *)alignPtr((Addr), MaxMemoryAllocation); + Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation); return Addr; } @@ -210,8 +211,8 @@ struct RecordReplayTy { if (EC) report_fatal_error("Error saving image : " + StringRef(EC.message())); if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) { - size_t Size = - getPtrDiff(TgtImageBitcode->ImageEnd, TgtImageBitcode->ImageStart); + size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd, + TgtImageBitcode->ImageStart); MemoryBufferRef MBR = MemoryBufferRef( StringRef((const char *)TgtImageBitcode->ImageStart, Size), ""); OS << MBR.getBuffer(); @@ -244,10 +245,10 @@ struct RecordReplayTy { int32_t NameLength = std::strlen(OffloadEntry.Name) + 1; memcpy(BufferPtr, OffloadEntry.Name, NameLength); - BufferPtr = advanceVoidPtr(BufferPtr, NameLength); + BufferPtr = utils::advancePtr(BufferPtr, NameLength); *((uint32_t *)(BufferPtr)) = OffloadEntry.Size; - BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t)); + BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t)); auto Err = Plugin::success(); { @@ -257,11 +258,12 @@ struct RecordReplayTy { } if (Err) report_fatal_error("Error retrieving data for global"); - BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size); + BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size); } assert(BufferPtr == GlobalsMB->get()->getBufferEnd() && "Buffer over/under-filled."); - assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) && + assert(Size == utils::getPtrDiff(BufferPtr, + GlobalsMB->get()->getBufferStart()) && "Buffer size mismatch"); StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size); @@ -931,7 +933,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, #ifdef OMPT_SUPPORT if (ompt::Initialized) { size_t Bytes = - getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); + 
utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); performOmptCallback( device_load, Plugin.getUserId(DeviceId), /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr, @@ -1159,8 +1161,8 @@ Expected PinnedAllocationMapTy::lockHostBuffer(void *HstPtr, return std::move(Err); // Return the device accessible pointer with the correct offset. - return advanceVoidPtr(Entry->DevAccessiblePtr, - getPtrDiff(HstPtr, Entry->HstPtr)); + return utils::advancePtr(Entry->DevAccessiblePtr, + utils::getPtrDiff(HstPtr, Entry->HstPtr)); } // No intersecting registered allocation found in the map. First, lock the @@ -1697,7 +1699,7 @@ int32_t GenericPluginTy::is_initialized() const { return Initialized; } int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { StringRef Buffer(reinterpret_cast(Image->ImageStart), - target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); + utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); @@ -1729,7 +1731,7 @@ int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId, __tgt_device_image *Image) { StringRef Buffer(reinterpret_cast(Image->ImageStart), - target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); + utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b6465d61bd033..015c7775ba351 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -705,7 +705,7 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::error("Wrong device Page size"); // Ceil to page size. 
-    Size = roundUp(Size, Granularity);
+    Size = utils::roundUp(Size, Granularity);
 
     // Create a handler of our allocation
     CUmemGenericAllocationHandle AHandle;
diff --git a/offload/src/DeviceImage.cpp b/offload/src/DeviceImage.cpp
index e42460b5cca4f..e5b4bf5526437 100644
--- a/offload/src/DeviceImage.cpp
+++ b/offload/src/DeviceImage.cpp
@@ -27,9 +27,8 @@ DeviceImageTy::DeviceImageTy(__tgt_bin_desc &BinaryDesc,
                              __tgt_device_image &TgtDeviceImage)
     : BinaryDesc(&BinaryDesc), Image(TgtDeviceImage) {
 
-  llvm::StringRef ImageStr(
-      static_cast(Image.ImageStart),
-      llvm::omp::target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
+  llvm::StringRef ImageStr(static_cast(Image.ImageStart),
+                           utils::getPtrDiff(Image.ImageEnd, Image.ImageStart));
 
   auto BinaryOrErr =
       llvm::object::OffloadBinary::create(llvm::MemoryBufferRef(ImageStr, ""));
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 7a2ee1303d68c..66137b53b0cb4 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -156,8 +156,8 @@ void handleTargetOutcome(bool Success, ident_t *Loc) {
       for (auto &Image : PM->deviceImages()) {
         const char *Start = reinterpret_cast(
             Image.getExecutableImage().ImageStart);
-        uint64_t Length = llvm::omp::target::getPtrDiff(
-            Start, Image.getExecutableImage().ImageEnd);
+        uint64_t Length =
+            utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd);
         llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
                                      /*Identifier=*/"");
 

From 84bf0da34dd020090e05816fcbda305d1f422c27 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert
Date: Thu, 5 Sep 2024 13:37:21 -0700
Subject: [PATCH 292/425] [Attributor][FIX] Ensure to always translate call
 site arguments (#107323)

When we propagate call site arguments, we always need to translate them;
this is important because we ended up picking the function argument for
a recursive call, not the call site argument. `@recBad` and `@recGood`
in `returned.ll` show the problem, as they used to be transformed the
same way. The restructuring cleans up the code and helps derive more
"returned" arguments and better information in the presence of
recursive calls.

The "dropped" attributes are simply dropped because we do not query
them anymore, not because we cannot derive them.
---
 .../Transforms/IPO/AttributorAttributes.cpp   | 103 +++++++---------
 .../Attributor/IPConstantProp/PR26044.ll      |  34 +++++---
 llvm/test/Transforms/Attributor/align.ll      |  34 +++++---
 .../Transforms/Attributor/memory_locations.ll |   2 +-
 llvm/test/Transforms/Attributor/range.ll      |   2 +-
 .../read_write_returned_arguments_scc.ll      |  28 ++---
 llvm/test/Transforms/Attributor/returned.ll   | 112 ++++++++++++++++--
 .../OpenMP/replace_globalization.ll           |   4 +-
 8 files changed, 197 insertions(+), 122 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 6b6d6d8d2a1e4..1fe8e6515fe0e 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -11516,9 +11516,21 @@ struct AAPotentialValuesReturned : public AAPotentialValuesFloating {
           return false;
         if (!AddValues)
           continue;
-        for (const AA::ValueAndContext &VAC : Values)
+
+        bool AllInterAreIntra = false;
+        if (S == AA::Interprocedural)
+          AllInterAreIntra =
+              llvm::all_of(Values, [&](const AA::ValueAndContext &VAC) {
+                return AA::isValidInScope(*VAC.getValue(), AnchorScope);
+              });
+
+        for (const AA::ValueAndContext &VAC : Values) {
          addValue(A, getState(), *VAC.getValue(),
-                   VAC.getCtxI() ? 
VAC.getCtxI() : CtxI, S, AnchorScope); + VAC.getCtxI() ? VAC.getCtxI() : CtxI, + AllInterAreIntra ? AA::AnyScope : S, AnchorScope); + } + if (AllInterAreIntra) + break; } return true; }; @@ -11547,16 +11559,6 @@ struct AAPotentialValuesReturned : public AAPotentialValuesFloating { : ChangeStatus::CHANGED; } - void addValue(Attributor &A, StateType &State, Value &V, - const Instruction *CtxI, AA::ValueScope S, - Function *AnchorScope) const override { - Function *F = getAssociatedFunction(); - if (auto *CB = dyn_cast(&V)) - if (CB->getCalledOperand() == F) - return; - Base::addValue(A, State, V, CtxI, S, AnchorScope); - } - ChangeStatus manifest(Attributor &A) override { if (ReturnedArg) return ChangeStatus::UNCHANGED; @@ -11651,64 +11653,39 @@ struct AAPotentialValuesCallSiteReturned : AAPotentialValuesImpl { UsedAssumedInformation)) return indicatePessimisticFixpoint(); - SmallVector Values; - if (!A.getAssumedSimplifiedValues(IRPosition::returned(*Callee), this, - Values, AA::Intraprocedural, - UsedAssumedInformation)) - return indicatePessimisticFixpoint(); - Function *Caller = CB->getCaller(); - bool AnyNonLocal = false; - for (auto &It : Values) { - Value *V = It.getValue(); - std::optional CallerV = A.translateArgumentToCallSiteContent( - V, *CB, *this, UsedAssumedInformation); - if (!CallerV.has_value()) { - // Nothing to do as long as no value was determined. - continue; - } - V = *CallerV ? *CallerV : V; - if (AA::isDynamicallyUnique(A, *this, *V) && - AA::isValidInScope(*V, Caller)) { - if (*CallerV) { - SmallVector ArgValues; - IRPosition IRP = IRPosition::value(*V); - if (auto *Arg = dyn_cast(V)) - if (Arg->getParent() == CB->getCalledOperand()) - IRP = IRPosition::callsite_argument(*CB, Arg->getArgNo()); - if (recurseForValue(A, IRP, AA::AnyScope)) - continue; - } - addValue(A, getState(), *V, CB, AA::AnyScope, getAnchorScope()); - } else { - AnyNonLocal = true; - break; - } - } - if (AnyNonLocal) { - Values.clear(); + auto AddScope = [&](AA::ValueScope S) { + SmallVector Values; if (!A.getAssumedSimplifiedValues(IRPosition::returned(*Callee), this, - Values, AA::Interprocedural, - UsedAssumedInformation)) - return indicatePessimisticFixpoint(); - AnyNonLocal = false; - getState() = PotentialLLVMValuesState::getBestState(); + Values, S, UsedAssumedInformation)) + return false; + for (auto &It : Values) { Value *V = It.getValue(); - if (!AA::isDynamicallyUnique(A, *this, *V)) - return indicatePessimisticFixpoint(); - if (AA::isValidInScope(*V, Caller)) { - addValue(A, getState(), *V, CB, AA::AnyScope, getAnchorScope()); - } else { - AnyNonLocal = true; - addValue(A, getState(), *V, CB, AA::Interprocedural, - getAnchorScope()); + std::optional CallerV = A.translateArgumentToCallSiteContent( + V, *CB, *this, UsedAssumedInformation); + if (!CallerV.has_value()) { + // Nothing to do as long as no value was determined. + continue; + } + V = *CallerV ? *CallerV : V; + if (*CallerV && AA::isDynamicallyUnique(A, *this, *V)) { + if (recurseForValue(A, IRPosition::value(*V), S)) + continue; + } + if (S == AA::Intraprocedural && !AA::isValidInScope(*V, Caller)) { + giveUpOnIntraprocedural(A); + return true; } + addValue(A, getState(), *V, CB, S, getAnchorScope()); } - if (AnyNonLocal) - giveUpOnIntraprocedural(A); - } + return true; + }; + if (!AddScope(AA::Intraprocedural)) + return indicatePessimisticFixpoint(); + if (!AddScope(AA::Interprocedural)) + return indicatePessimisticFixpoint(); return (AssumedBefore == getAssumed()) ? 
ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; } diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll b/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll index 35d0eeac50d51..f673ffc3bfac6 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll @@ -16,8 +16,7 @@ define void @fn2(ptr %P, i1 %C) { ; TUNIT: if.end: ; TUNIT-NEXT: [[E_2:%.*]] = phi ptr [ [[P]], [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ] ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[E_2]], align 4 -; TUNIT-NEXT: [[CALL:%.*]] = call i32 @fn1(i32 [[TMP0]]) #[[ATTR3:[0-9]+]] -; TUNIT-NEXT: store i32 [[CALL]], ptr [[P]], align 4 +; TUNIT-NEXT: store i32 [[TMP0]], ptr [[P]], align 4 ; TUNIT-NEXT: br label [[FOR_COND1]] ; TUNIT: exit: ; TUNIT-NEXT: ret void @@ -55,11 +54,11 @@ exit: } define internal i32 @fn1(i32 %p1) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@fn1 -; CHECK-SAME: (i32 returned [[P1:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 [[P1]] +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@fn1 +; CGSCC-SAME: (i32 returned [[P1:%.*]]) #[[ATTR1:[0-9]+]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: ret i32 [[P1]] ; entry: %tobool = icmp ne i32 %p1, 0 @@ -71,7 +70,7 @@ define void @fn_no_null_opt(ptr %P, i1 %C) null_pointer_is_valid { ; ; TUNIT: Function Attrs: nofree norecurse nosync nounwind null_pointer_is_valid ; TUNIT-LABEL: define {{[^@]+}}@fn_no_null_opt -; TUNIT-SAME: (ptr nocapture nofree writeonly [[P:%.*]], i1 [[C:%.*]]) #[[ATTR2:[0-9]+]] { +; TUNIT-SAME: (ptr nocapture nofree writeonly [[P:%.*]], i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: br label [[IF_END:%.*]] ; TUNIT: for.cond1: @@ -79,8 +78,7 @@ define void @fn_no_null_opt(ptr %P, i1 %C) null_pointer_is_valid { ; TUNIT: if.end: ; TUNIT-NEXT: [[E_2:%.*]] = phi ptr [ undef, [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ] ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr null, align 4294967296 -; TUNIT-NEXT: [[CALL:%.*]] = call i32 @fn0(i32 [[TMP0]]) #[[ATTR3]] -; TUNIT-NEXT: store i32 [[CALL]], ptr [[P]], align 4 +; TUNIT-NEXT: store i32 [[TMP0]], ptr [[P]], align 4 ; TUNIT-NEXT: br label [[FOR_COND1]] ; TUNIT: exit: ; TUNIT-NEXT: ret void @@ -118,11 +116,11 @@ exit: } define internal i32 @fn0(i32 %p1) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@fn0 -; CHECK-SAME: (i32 returned [[P1:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 [[P1]] +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@fn0 +; CGSCC-SAME: (i32 returned [[P1:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: ret i32 [[P1]] ; entry: %tobool = icmp ne i32 %p1, 0 @@ -131,12 +129,12 @@ entry: } ;. ; TUNIT: attributes #[[ATTR0]] = { nofree norecurse nosync nounwind memory(argmem: readwrite) } -; TUNIT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } -; TUNIT: attributes #[[ATTR2]] = { nofree norecurse nosync nounwind null_pointer_is_valid } -; TUNIT: attributes #[[ATTR3]] = { nofree nosync nounwind memory(none) } +; TUNIT: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind null_pointer_is_valid } ;. 
; CGSCC: attributes #[[ATTR0]] = { nofree nosync nounwind memory(argmem: readwrite) } ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR2]] = { nofree nosync nounwind null_pointer_is_valid } ; CGSCC: attributes #[[ATTR3]] = { nofree nosync } ;. +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/Attributor/align.ll b/llvm/test/Transforms/Attributor/align.ll index 9880e53fd43a5..0a0cd415a2ab7 100644 --- a/llvm/test/Transforms/Attributor/align.ll +++ b/llvm/test/Transforms/Attributor/align.ll @@ -416,14 +416,14 @@ define void @test9_traversal(i1 %cnd, ptr align 4 %B, ptr align 8 %C) { ; FIXME: This will work with an upcoming patch (D66618 or similar) ; store i32 -1, ptr %g1, align 32 define ptr @test10a(ptr align 32 %p) { -; TUNIT: Function Attrs: nofree nosync nounwind +; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; TUNIT-LABEL: define {{[^@]+}}@test10a ; TUNIT-SAME: (ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR3:[0-9]+]] { ; TUNIT-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 32 ; TUNIT-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; TUNIT: t: -; TUNIT-NEXT: [[R:%.*]] = call align 32 ptr @test10a(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR3]] +; TUNIT-NEXT: [[R:%.*]] = call align 32 ptr @test10a(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR13:[0-9]+]] ; TUNIT-NEXT: store i32 1, ptr [[R]], align 32 ; TUNIT-NEXT: [[G0:%.*]] = getelementptr i32, ptr [[P]], i32 8 ; TUNIT-NEXT: br label [[E:%.*]] @@ -435,14 +435,14 @@ define ptr @test10a(ptr align 32 %p) { ; TUNIT-NEXT: [[PHI:%.*]] = phi ptr [ [[G0]], [[T]] ], [ [[G1]], [[F]] ] ; TUNIT-NEXT: ret ptr [[PHI]] ; -; CGSCC: Function Attrs: nofree nosync nounwind +; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; CGSCC-LABEL: define {{[^@]+}}@test10a ; CGSCC-SAME: (ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4:[0-9]+]] { ; CGSCC-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 32 ; CGSCC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: -; CGSCC-NEXT: [[R:%.*]] = call align 32 ptr @test10a(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR4]] +; CGSCC-NEXT: [[R:%.*]] = call align 32 ptr @test10a(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR16:[0-9]+]] ; CGSCC-NEXT: store i32 1, ptr [[R]], align 32 ; CGSCC-NEXT: [[G0:%.*]] = getelementptr i32, ptr [[P]], i32 8 ; CGSCC-NEXT: br label [[E:%.*]] @@ -478,14 +478,14 @@ e: ; FIXME: This will work with an upcoming patch (D66618 or similar) ; store i32 -1, ptr %g1, align 32 define ptr @test10b(ptr align 32 %p) { -; TUNIT: Function Attrs: nofree nosync nounwind +; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; TUNIT-LABEL: define {{[^@]+}}@test10b ; TUNIT-SAME: (ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR3]] { ; TUNIT-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 32 ; TUNIT-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; 
TUNIT: t: -; TUNIT-NEXT: [[R:%.*]] = call align 32 ptr @test10b(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR3]] +; TUNIT-NEXT: [[R:%.*]] = call align 32 ptr @test10b(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR13]] ; TUNIT-NEXT: store i32 1, ptr [[R]], align 32 ; TUNIT-NEXT: [[G0:%.*]] = getelementptr i32, ptr [[P]], i32 8 ; TUNIT-NEXT: br label [[E:%.*]] @@ -497,14 +497,14 @@ define ptr @test10b(ptr align 32 %p) { ; TUNIT-NEXT: [[PHI:%.*]] = phi ptr [ [[G0]], [[T]] ], [ [[G1]], [[F]] ] ; TUNIT-NEXT: ret ptr [[PHI]] ; -; CGSCC: Function Attrs: nofree nosync nounwind +; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; CGSCC-LABEL: define {{[^@]+}}@test10b ; CGSCC-SAME: (ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { ; CGSCC-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 32 ; CGSCC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: -; CGSCC-NEXT: [[R:%.*]] = call align 32 ptr @test10b(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR4]] +; CGSCC-NEXT: [[R:%.*]] = call align 32 ptr @test10b(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR16]] ; CGSCC-NEXT: store i32 1, ptr [[R]], align 32 ; CGSCC-NEXT: [[G0:%.*]] = getelementptr i32, ptr [[P]], i32 8 ; CGSCC-NEXT: br label [[E:%.*]] @@ -946,7 +946,7 @@ define i32 @musttail_caller_1(ptr %p) { ; TUNIT-NEXT: [[C:%.*]] = load i1, ptr @cnd, align 1 ; TUNIT-NEXT: br i1 [[C]], label [[MT:%.*]], label [[EXIT:%.*]] ; TUNIT: mt: -; TUNIT-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef readonly [[P]]) #[[ATTR13:[0-9]+]] +; TUNIT-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef readonly [[P]]) #[[ATTR14:[0-9]+]] ; TUNIT-NEXT: ret i32 [[V]] ; TUNIT: exit: ; TUNIT-NEXT: ret i32 0 @@ -957,7 +957,7 @@ define i32 @musttail_caller_1(ptr %p) { ; CGSCC-NEXT: [[C:%.*]] = load i1, ptr @cnd, align 1 ; CGSCC-NEXT: br i1 [[C]], label [[MT:%.*]], label [[EXIT:%.*]] ; CGSCC: mt: -; CGSCC-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef nonnull readonly dereferenceable(4) [[P]]) #[[ATTR16:[0-9]+]] +; CGSCC-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef nonnull readonly dereferenceable(4) [[P]]) #[[ATTR17:[0-9]+]] ; CGSCC-NEXT: ret i32 [[V]] ; CGSCC: exit: ; CGSCC-NEXT: ret i32 0 @@ -1089,7 +1089,7 @@ define ptr @aligned_8_return_caller(ptr align(16) %a, i1 %c1, i1 %c2) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@aligned_8_return_caller ; TUNIT-SAME: (ptr nofree readnone align 16 "no-capture-maybe-returned" [[A:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) #[[ATTR10]] { -; TUNIT-NEXT: [[R:%.*]] = call align 8 ptr @aligned_8_return(ptr noalias nofree readnone align 16 "no-capture-maybe-returned" [[A]], i1 noundef [[C1]], i1 [[C2]]) #[[ATTR14:[0-9]+]] +; TUNIT-NEXT: [[R:%.*]] = call align 8 ptr @aligned_8_return(ptr noalias nofree readnone align 16 "no-capture-maybe-returned" [[A]], i1 noundef [[C1]], i1 [[C2]]) #[[ATTR15:[0-9]+]] ; TUNIT-NEXT: ret ptr [[R]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) @@ -1221,7 +1221,7 @@ attributes #2 = { null_pointer_is_valid } ; TUNIT: 
attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR1]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR2]] = { nounwind } -; TUNIT: attributes #[[ATTR3]] = { nofree nosync nounwind } +; TUNIT: attributes #[[ATTR3]] = { nofree nosync nounwind memory(argmem: readwrite) } ; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) } ; TUNIT: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } ; TUNIT: attributes #[[ATTR6]] = { nounwind willreturn } @@ -1231,14 +1231,15 @@ attributes #2 = { null_pointer_is_valid } ; TUNIT: attributes #[[ATTR10]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; TUNIT: attributes #[[ATTR11]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(read) } ; TUNIT: attributes #[[ATTR12]] = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) } -; TUNIT: attributes #[[ATTR13]] = { nofree nosync nounwind willreturn memory(read) } -; TUNIT: attributes #[[ATTR14]] = { nofree nosync nounwind willreturn } +; TUNIT: attributes #[[ATTR13]] = { nofree nosync nounwind } +; TUNIT: attributes #[[ATTR14]] = { nofree nosync nounwind willreturn memory(read) } +; TUNIT: attributes #[[ATTR15]] = { nofree nosync nounwind willreturn } ;. ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR2]] = { noinline nounwind uwtable } ; CGSCC: attributes #[[ATTR3]] = { nounwind } -; CGSCC: attributes #[[ATTR4]] = { nofree nosync nounwind } +; CGSCC: attributes #[[ATTR4]] = { nofree nosync nounwind memory(argmem: readwrite) } ; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) } ; CGSCC: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } ; CGSCC: attributes #[[ATTR7]] = { nounwind willreturn } @@ -1250,5 +1251,6 @@ attributes #2 = { null_pointer_is_valid } ; CGSCC: attributes #[[ATTR13]] = { mustprogress nofree nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR14]] = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) } ; CGSCC: attributes #[[ATTR15]] = { nofree nosync willreturn } -; CGSCC: attributes #[[ATTR16]] = { nofree willreturn memory(read) } +; CGSCC: attributes #[[ATTR16]] = { nofree nosync nounwind } +; CGSCC: attributes #[[ATTR17]] = { nofree willreturn memory(read) } ;. 
diff --git a/llvm/test/Transforms/Attributor/memory_locations.ll b/llvm/test/Transforms/Attributor/memory_locations.ll index 2dbdf9e6048c0..a7d3fba9cf9b8 100644 --- a/llvm/test/Transforms/Attributor/memory_locations.ll +++ b/llvm/test/Transforms/Attributor/memory_locations.ll @@ -35,7 +35,7 @@ define dso_local ptr @internal_only_rec(i32 %arg) { ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[ARG]], 2 -; CHECK-NEXT: [[CALL:%.*]] = call noalias ptr @internal_only_rec(i32 [[DIV]]) +; CHECK-NEXT: [[CALL:%.*]] = call ptr @internal_only_rec(i32 [[DIV]]) ; CHECK-NEXT: br label [[RETURN:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[ARG]] to i64 diff --git a/llvm/test/Transforms/Attributor/range.ll b/llvm/test/Transforms/Attributor/range.ll index 9b2f9ed2dde91..48040fec772dc 100644 --- a/llvm/test/Transforms/Attributor/range.ll +++ b/llvm/test/Transforms/Attributor/range.ll @@ -48,7 +48,7 @@ define void @test0-icmp-check(ptr %p){ ; ret = [0, 10) ; TUNIT-LABEL: define {{[^@]+}}@test0-icmp-check ; TUNIT-SAME: (ptr nocapture nofree readonly align 4 [[P:%.*]]) { -; TUNIT-NEXT: [[RET:%.*]] = tail call i32 @test0(ptr nocapture nofree noundef readonly align 4 [[P]]) #[[ATTR3]], !range [[RNG0]] +; TUNIT-NEXT: [[RET:%.*]] = tail call i32 @test0(ptr nocapture nofree noundef readonly align 4 [[P]]) #[[ATTR3]] ; TUNIT-NEXT: [[CMP_EQ_1:%.*]] = icmp eq i32 [[RET]], 10 ; TUNIT-NEXT: [[CMP_EQ_2:%.*]] = icmp eq i32 [[RET]], 9 ; TUNIT-NEXT: [[CMP_EQ_3:%.*]] = icmp eq i32 [[RET]], 8 diff --git a/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll b/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll index 81530e9164899..775c949ca8939 100644 --- a/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll +++ b/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll @@ -64,7 +64,7 @@ entry: define internal ptr @internal_ret0_nw(ptr %n0, ptr %w0) { ; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; TUNIT-LABEL: define {{[^@]+}}@internal_ret0_nw -; TUNIT-SAME: (ptr nofree [[N0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { +; TUNIT-SAME: (ptr nofree returned [[N0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[R0:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[R1:%.*]] = alloca i32, align 4 @@ -84,12 +84,12 @@ define internal ptr @internal_ret0_nw(ptr %n0, ptr %w0) { ; TUNIT-NEXT: [[CALL5:%.*]] = call ptr @internal_ret0_nw(ptr nofree nonnull [[N0]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) #[[ATTR3]] ; TUNIT-NEXT: br label [[RETURN]] ; TUNIT: return: -; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL5]], [[IF_END]] ], [ [[N0]], [[IF_THEN]] ] -; TUNIT-NEXT: ret ptr [[RETVAL_0]] +; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[N0]], [[IF_END]] ], [ [[N0]], [[IF_THEN]] ] +; TUNIT-NEXT: ret ptr [[N0]] ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; CGSCC-LABEL: define {{[^@]+}}@internal_ret0_nw -; CGSCC-SAME: (ptr nofree [[N0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { +; CGSCC-SAME: (ptr nofree returned [[N0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[R0:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: [[R1:%.*]] = alloca i32, align 4 @@ -109,8 +109,8 @@ define internal ptr @internal_ret0_nw(ptr %n0, ptr %w0) { ; CGSCC-NEXT: [[CALL5:%.*]] = call ptr @internal_ret0_nw(ptr nofree nonnull [[N0]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) 
#[[ATTR2]] ; CGSCC-NEXT: br label [[RETURN]] ; CGSCC: return: -; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL5]], [[IF_END]] ], [ [[N0]], [[IF_THEN]] ] -; CGSCC-NEXT: ret ptr [[RETVAL_0]] +; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[N0]], [[IF_END]] ], [ [[N0]], [[IF_THEN]] ] +; CGSCC-NEXT: ret ptr [[N0]] ; entry: %r0 = alloca i32, align 4 @@ -164,7 +164,7 @@ define internal ptr @internal_ret1_rrw(ptr %r0, ptr %r1, ptr %w0) { ; TUNIT-NEXT: [[CALL8:%.*]] = call ptr @internal_ret0_nw(ptr nofree nonnull align 4 dereferenceable(4) [[R1]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) #[[ATTR3]] ; TUNIT-NEXT: br label [[RETURN]] ; TUNIT: return: -; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL8]], [[IF_END]] ], [ [[R1]], [[IF_THEN]] ] +; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[R1]], [[IF_END]] ], [ [[R1]], [[IF_THEN]] ] ; TUNIT-NEXT: ret ptr undef ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) @@ -192,7 +192,7 @@ define internal ptr @internal_ret1_rrw(ptr %r0, ptr %r1, ptr %w0) { ; CGSCC-NEXT: [[CALL8:%.*]] = call ptr @internal_ret0_nw(ptr nofree nonnull align 4 dereferenceable(4) [[R1]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) #[[ATTR2]] ; CGSCC-NEXT: br label [[RETURN]] ; CGSCC: return: -; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL8]], [[IF_END]] ], [ [[R1]], [[IF_THEN]] ] +; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[R1]], [[IF_END]] ], [ [[R1]], [[IF_THEN]] ] ; CGSCC-NEXT: ret ptr undef ; entry: @@ -259,7 +259,7 @@ return: ; preds = %if.end, %if.then define internal ptr @internal_ret1_rw(ptr %r0, ptr %w0) { ; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; TUNIT-LABEL: define {{[^@]+}}@internal_ret1_rw -; TUNIT-SAME: (ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { +; TUNIT-SAME: (ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0:%.*]], ptr nofree returned [[W0:%.*]]) #[[ATTR0]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[R0]], align 4 ; TUNIT-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 @@ -276,12 +276,12 @@ define internal ptr @internal_ret1_rw(ptr %r0, ptr %w0) { ; TUNIT-NEXT: [[CALL4:%.*]] = call ptr @external_ret2_nrw(ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0]], ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) #[[ATTR3]] ; TUNIT-NEXT: br label [[RETURN]] ; TUNIT: return: -; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL4]], [[IF_END]] ], [ [[W0]], [[IF_THEN]] ] -; TUNIT-NEXT: ret ptr [[RETVAL_0]] +; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[W0]], [[IF_END]] ], [ [[W0]], [[IF_THEN]] ] +; TUNIT-NEXT: ret ptr [[W0]] ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; CGSCC-LABEL: define {{[^@]+}}@internal_ret1_rw -; CGSCC-SAME: (ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { +; CGSCC-SAME: (ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0:%.*]], ptr nofree returned [[W0:%.*]]) #[[ATTR0]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[R0]], align 4 ; CGSCC-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 @@ -298,8 +298,8 @@ define internal ptr @internal_ret1_rw(ptr %r0, ptr %w0) { ; CGSCC-NEXT: [[CALL4:%.*]] = call ptr @external_ret2_nrw(ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0]], ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) 
#[[ATTR2]] ; CGSCC-NEXT: br label [[RETURN]] ; CGSCC: return: -; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL4]], [[IF_END]] ], [ [[W0]], [[IF_THEN]] ] -; CGSCC-NEXT: ret ptr [[RETVAL_0]] +; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[W0]], [[IF_END]] ], [ [[W0]], [[IF_THEN]] ] +; CGSCC-NEXT: ret ptr [[W0]] ; entry: %0 = load i32, ptr %r0, align 4 diff --git a/llvm/test/Transforms/Attributor/returned.ll b/llvm/test/Transforms/Attributor/returned.ll index e94cb95069694..bc55a50f0e6f7 100644 --- a/llvm/test/Transforms/Attributor/returned.ll +++ b/llvm/test/Transforms/Attributor/returned.ll @@ -483,7 +483,7 @@ define ptr @rt0(ptr %a) #0 { ; TUNIT-LABEL: define {{[^@]+}}@rt0 ; TUNIT-SAME: (ptr nofree noundef nonnull readonly returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[A:%.*]]) #[[ATTR3:[0-9]+]] { ; TUNIT-NEXT: entry: -; TUNIT-NEXT: [[CALL:%.*]] = call ptr @rt0(ptr nofree noundef nonnull readonly align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]]) #[[ATTR9:[0-9]+]] +; TUNIT-NEXT: [[CALL:%.*]] = call ptr @rt0(ptr nofree noundef nonnull readonly align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]]) #[[ATTR10:[0-9]+]] ; TUNIT-NEXT: ret ptr [[A]] ; ; CGSCC: Function Attrs: nofree noinline nosync nounwind memory(argmem: read) uwtable @@ -667,7 +667,7 @@ define ptr @calls_unknown_fn(ptr %r) #0 { ; TUNIT: Function Attrs: noinline nounwind uwtable ; TUNIT-LABEL: define {{[^@]+}}@calls_unknown_fn ; TUNIT-SAME: (ptr nofree readnone returned "no-capture-maybe-returned" [[R:%.*]]) #[[ATTR5:[0-9]+]] { -; TUNIT-NEXT: tail call void @unknown_fn(ptr noundef nonnull @calls_unknown_fn) #[[ATTR10:[0-9]+]] +; TUNIT-NEXT: tail call void @unknown_fn(ptr noundef nonnull @calls_unknown_fn) #[[ATTR11:[0-9]+]] ; TUNIT-NEXT: ret ptr [[R]] ; ; CGSCC: Function Attrs: noinline nounwind uwtable @@ -716,7 +716,7 @@ define ptr @calls_maybe_redefined_fn(ptr %r) #0 { ; TUNIT-LABEL: define {{[^@]+}}@calls_maybe_redefined_fn ; TUNIT-SAME: (ptr returned [[R:%.*]]) #[[ATTR5]] { ; TUNIT-NEXT: entry: -; TUNIT-NEXT: [[CALL:%.*]] = call ptr @maybe_redefined_fn(ptr [[R]]) #[[ATTR10]] +; TUNIT-NEXT: [[CALL:%.*]] = call ptr @maybe_redefined_fn(ptr [[R]]) #[[ATTR11]] ; TUNIT-NEXT: ret ptr [[R]] ; ; CGSCC: Function Attrs: noinline nounwind uwtable @@ -765,7 +765,7 @@ define ptr @calls_maybe_redefined_fn2(ptr %r) #0 { ; TUNIT-LABEL: define {{[^@]+}}@calls_maybe_redefined_fn2 ; TUNIT-SAME: (ptr [[R:%.*]]) #[[ATTR5]] { ; TUNIT-NEXT: entry: -; TUNIT-NEXT: [[CALL:%.*]] = call ptr @maybe_redefined_fn2(ptr [[R]]) #[[ATTR10]] +; TUNIT-NEXT: [[CALL:%.*]] = call ptr @maybe_redefined_fn2(ptr [[R]]) #[[ATTR11]] ; TUNIT-NEXT: ret ptr [[CALL]] ; ; CGSCC: Function Attrs: noinline nounwind uwtable @@ -1451,6 +1451,103 @@ declare dso_local ptr @__dynamic_cast(ptr, ptr, ptr, i64) ; UTC_ARGS: --enable +; This does not return %arg, @recGood does. 
+ +define internal i32 @recBad(i1 %c, i32 %arg) { +; TUNIT: Function Attrs: nofree nosync nounwind memory(none) +; TUNIT-LABEL: define {{[^@]+}}@recBad +; TUNIT-SAME: (i1 noundef [[C:%.*]], i32 [[ARG:%.*]]) #[[ATTR8]] { +; TUNIT-NEXT: [[ADD:%.*]] = add i32 [[ARG]], 1 +; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; TUNIT: t: +; TUNIT-NEXT: [[R:%.*]] = call i32 @recBad(i1 noundef false, i32 [[ADD]]) #[[ATTR8]] +; TUNIT-NEXT: ret i32 [[ADD]] +; TUNIT: f: +; TUNIT-NEXT: ret i32 [[ARG]] +; +; CGSCC: Function Attrs: nofree nosync nounwind memory(none) +; CGSCC-LABEL: define {{[^@]+}}@recBad +; CGSCC-SAME: (i1 noundef [[C:%.*]], i32 [[ARG:%.*]]) #[[ATTR7]] { +; CGSCC-NEXT: [[ADD:%.*]] = add i32 [[ARG]], 1 +; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; CGSCC: t: +; CGSCC-NEXT: [[R:%.*]] = call i32 @recBad(i1 noundef false, i32 [[ADD]]) #[[ATTR7]] +; CGSCC-NEXT: ret i32 [[ADD]] +; CGSCC: f: +; CGSCC-NEXT: ret i32 [[ARG]] +; + %add = add i32 %arg, 1 + br i1 %c, label %t, label %f +t: + %r = call i32 @recBad(i1 false, i32 %add) + ret i32 %r +f: + ret i32 %arg +} + +define i32 @recBadCaller(i1 %cQ, i32 %argQ) { +; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none) +; TUNIT-LABEL: define {{[^@]+}}@recBadCaller +; TUNIT-SAME: (i1 [[CQ:%.*]], i32 [[ARGQ:%.*]]) #[[ATTR9:[0-9]+]] { +; TUNIT-NEXT: [[RQ:%.*]] = call i32 @recBad(i1 noundef [[CQ]], i32 [[ARGQ]]) #[[ATTR8]] +; TUNIT-NEXT: ret i32 [[RQ]] +; +; CGSCC: Function Attrs: nofree nosync nounwind memory(none) +; CGSCC-LABEL: define {{[^@]+}}@recBadCaller +; CGSCC-SAME: (i1 noundef [[CQ:%.*]], i32 [[ARGQ:%.*]]) #[[ATTR7]] { +; CGSCC-NEXT: [[RQ:%.*]] = call i32 @recBad(i1 noundef [[CQ]], i32 [[ARGQ]]) #[[ATTR7]] +; CGSCC-NEXT: ret i32 [[RQ]] +; + %rQ = call i32 @recBad(i1 %cQ, i32 %argQ) + ret i32 %rQ +} + +define internal i32 @recGood(i1 %c, i32 %arg) { +; TUNIT: Function Attrs: nofree nosync nounwind memory(none) +; TUNIT-LABEL: define {{[^@]+}}@recGood +; TUNIT-SAME: (i1 noundef [[C:%.*]], i32 returned [[ARG:%.*]]) #[[ATTR8]] { +; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; TUNIT: t: +; TUNIT-NEXT: [[R:%.*]] = call i32 @recGood(i1 noundef false, i32 [[ARG]]) #[[ATTR8]] +; TUNIT-NEXT: ret i32 [[ARG]] +; TUNIT: f: +; TUNIT-NEXT: ret i32 [[ARG]] +; +; CGSCC: Function Attrs: nofree nosync nounwind memory(none) +; CGSCC-LABEL: define {{[^@]+}}@recGood +; CGSCC-SAME: (i1 noundef [[C:%.*]], i32 returned [[ARG:%.*]]) #[[ATTR7]] { +; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; CGSCC: t: +; CGSCC-NEXT: [[R:%.*]] = call i32 @recGood(i1 noundef false, i32 [[ARG]]) #[[ATTR7]] +; CGSCC-NEXT: ret i32 [[ARG]] +; CGSCC: f: +; CGSCC-NEXT: ret i32 [[ARG]] +; + br i1 %c, label %t, label %f +t: + %r = call i32 @recGood(i1 false, i32 %arg) + ret i32 %r +f: + ret i32 %arg +} + +define i32 @recGoodCaller(i1 %cQ, i32 %argQ) { +; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none) +; TUNIT-LABEL: define {{[^@]+}}@recGoodCaller +; TUNIT-SAME: (i1 [[CQ:%.*]], i32 returned [[ARGQ:%.*]]) #[[ATTR9]] { +; TUNIT-NEXT: [[RQ:%.*]] = call i32 @recGood(i1 noundef [[CQ]], i32 [[ARGQ]]) #[[ATTR8]] +; TUNIT-NEXT: ret i32 [[ARGQ]] +; +; CGSCC: Function Attrs: nofree nosync nounwind memory(none) +; CGSCC-LABEL: define {{[^@]+}}@recGoodCaller +; CGSCC-SAME: (i1 noundef [[CQ:%.*]], i32 [[ARGQ:%.*]]) #[[ATTR7]] { +; CGSCC-NEXT: [[RQ:%.*]] = call i32 @recGood(i1 noundef [[CQ]], i32 [[ARGQ]]) #[[ATTR7]] +; CGSCC-NEXT: ret i32 [[RQ]] +; + %rQ = call i32 @recGood(i1 %cQ, i32 %argQ) + ret i32 %rQ +} + 
attributes #0 = { noinline nounwind uwtable } ;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } @@ -1462,9 +1559,10 @@ attributes #0 = { noinline nounwind uwtable } ; TUNIT: attributes #[[ATTR6]] = { noreturn } ; TUNIT: attributes #[[ATTR7:[0-9]+]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; TUNIT: attributes #[[ATTR8]] = { nofree nosync nounwind memory(none) } -; TUNIT: attributes #[[ATTR9]] = { nofree nosync nounwind memory(read) } -; TUNIT: attributes #[[ATTR10]] = { nounwind } -; TUNIT: attributes #[[ATTR11:[0-9]+]] = { nounwind memory(none) } +; TUNIT: attributes #[[ATTR9]] = { nofree norecurse nosync nounwind memory(none) } +; TUNIT: attributes #[[ATTR10]] = { nofree nosync nounwind memory(read) } +; TUNIT: attributes #[[ATTR11]] = { nounwind } +; TUNIT: attributes #[[ATTR12:[0-9]+]] = { nounwind memory(none) } ;. ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR1]] = { nofree noinline nosync nounwind memory(none) uwtable } diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll index 0f89b428de7bf..6e4fb9e57388b 100644 --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -201,7 +201,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ; CHECK-NEXT: ret void ; ; -; CHECK: Function Attrs: norecurse nosync nounwind allocsize(0) memory(read) +; CHECK: Function Attrs: nosync nounwind allocsize(0) memory(read) ; CHECK-LABEL: define {{[^@]+}}@__kmpc_alloc_shared ; CHECK-SAME: (i64 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr @offset, align 4 @@ -216,7 +216,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ;. ; CHECK: attributes #[[ATTR0]] = { "kernel" } ; CHECK: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind memory(write) } -; CHECK: attributes #[[ATTR2]] = { norecurse nosync nounwind allocsize(0) memory(read) } +; CHECK: attributes #[[ATTR2]] = { nosync nounwind allocsize(0) memory(read) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nosync nounwind } ; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR5]] = { "llvm.assume"="omp_no_openmp" } From 52dca6ffae08fcd86cff32ab469870016a6aceb5 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 5 Sep 2024 13:38:39 -0700 Subject: [PATCH 293/425] [SandboxVec] Boilerplate (#107431) This patch implements the new pass and registers it with the pass manager. For context, this is a vectorizer that operates on Sandbox IR, which is a transactional IR on top of LLVM IR. 
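For reference, once registered in PassRegistry.def the boilerplate pass can be exercised directly, e.g. with `opt -passes=sandbox-vectorizer -S input.ll` (where `input.ll` is just a placeholder name); the new boilerplate.ll test below does exactly this and only verifies that the pass runs and leaves the function unchanged.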
--- .../SandboxVectorizer/SandboxVectorizer.h | 31 +++++++++++++++ llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/Transforms/Vectorize/CMakeLists.txt | 3 ++ .../SandboxVectorizer/SandboxVectorizer.cpp | 39 +++++++++++++++++++ .../SandboxVectorizer/boilerplate.ll | 11 ++++++ 6 files changed, 86 insertions(+) create mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h create mode 100644 llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp create mode 100644 llvm/test/Transforms/SandboxVectorizer/boilerplate.ll diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h new file mode 100644 index 0000000000000..0d4dbae44521e --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h @@ -0,0 +1,31 @@ +//===- SandboxVectorizer.h --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/PassManager.h" +#include "llvm/SandboxIR/SandboxIR.h" + +namespace llvm { + +class SandboxVectorizerPass : public PassInfoMixin { + TargetTransformInfo *TTI = nullptr; + +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + bool runImpl(Function &F); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index a22abed8051a1..7c0acc0745eba 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -321,6 +321,7 @@ #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" #include diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index d6067089c6b5c..a1324e8170566 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -453,6 +453,7 @@ FUNCTION_PASS("reassociate", ReassociatePass()) FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass()) FUNCTION_PASS("reg2mem", RegToMemPass()) FUNCTION_PASS("safe-stack", SafeStackPass(TM)) +FUNCTION_PASS("sandbox-vectorizer", SandboxVectorizerPass()) FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass()) FUNCTION_PASS("scalarizer", ScalarizerPass()) FUNCTION_PASS("sccp", SCCPPass()) diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 5c88d94d96622..649faad48d71e 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_component_library(LLVMVectorize LoopIdiomVectorize.cpp 
LoopVectorizationLegality.cpp LoopVectorize.cpp + SandboxVectorizer/SandboxVectorizer.cpp SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp @@ -18,6 +19,7 @@ add_llvm_component_library(LLVMVectorize ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Vectorize + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Vectorize/SandboxVectorizer DEPENDS intrinsics_gen @@ -27,4 +29,5 @@ add_llvm_component_library(LLVMVectorize Core Support TransformUtils + SandboxIR ) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp new file mode 100644 index 0000000000000..072a6606694a0 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -0,0 +1,39 @@ +//===- SandboxVectorizer.cpp - Vectorizer based on Sandbox IR -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define SV_NAME "sandbox-vectorizer" +#define DEBUG_TYPE SV_NAME + +PreservedAnalyses SandboxVectorizerPass::run(Function &F, + FunctionAnalysisManager &AM) { + TTI = &AM.getResult(F); + + bool Changed = runImpl(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} + +bool SandboxVectorizerPass::runImpl(Function &F) { + LLVM_DEBUG(dbgs() << "SBVec: Analyzing " << F.getName() << ".\n"); + sandboxir::Context Ctx(F.getContext()); + // Create SandboxIR for `F`. + sandboxir::Function &SBF = *Ctx.createFunction(&F); + // TODO: Initialize SBVec Pass Manager + (void)SBF; + + return false; +} diff --git a/llvm/test/Transforms/SandboxVectorizer/boilerplate.ll b/llvm/test/Transforms/SandboxVectorizer/boilerplate.ll new file mode 100644 index 0000000000000..353659d41485f --- /dev/null +++ b/llvm/test/Transforms/SandboxVectorizer/boilerplate.ll @@ -0,0 +1,11 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sandbox-vectorizer %s -S | FileCheck %s + +; This test checks that the pass was registered with the pass manager. +; TODO: Remove this test once actual tests land. +define void @boilerplate() { +; CHECK-LABEL: define void @boilerplate() { +; CHECK-NEXT: ret void +; + ret void +} From 4634a480e0e5aa3116b397369fe3877a8dfe4dc0 Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Thu, 5 Sep 2024 13:43:34 -0700 Subject: [PATCH 294/425] [scudo] Add a method to use a hard-coded page size (#106646) Currently, only Android supports using a hard-code page size. Make this a bit more generic so any platform that wants to can use this. In addition, add a getPageSizeLogCached() function since this value is used in release.h and can avoid keeping this value around in objects. Finally, change some of the release.h page size multiplies to shifts using the new page size log value. 
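To make the multiply-to-shift rewrite concrete: the page size is a power of two, so PageSize == 1 << PageSizeLog and divisions or multiplications by the page size reduce to shifts by the cached log value. For example, the fragmentation accounting in release.h changes from

  ReleasedPagesCount += (To - From) / getPageSizeCached();

to

  ReleasedPagesCount += (To - From) >> getPageSizeLogCached();

and offsets such as ReleasePageOffset * PageSize become ReleasePageOffset << getPageSizeLogCached().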
--- compiler-rt/lib/scudo/standalone/combined.h | 3 +++ compiler-rt/lib/scudo/standalone/common.cpp | 10 ++++++- compiler-rt/lib/scudo/standalone/common.h | 30 ++++++++++++++++++--- compiler-rt/lib/scudo/standalone/linux.cpp | 3 +++ compiler-rt/lib/scudo/standalone/platform.h | 5 ++++ compiler-rt/lib/scudo/standalone/release.h | 30 ++++++++++++--------- 6 files changed, 63 insertions(+), 18 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 9c26282e6f860..a5f1bc388e882 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -140,6 +140,9 @@ class Allocator { typedef typename QuarantineT::CacheT QuarantineCacheT; void init() { + // Make sure that the page size is initialized if it's not a constant. + CHECK_NE(getPageSizeCached(), 0U); + performSanityChecks(); // Check if hardware CRC32 is supported in the binary and by the platform, diff --git a/compiler-rt/lib/scudo/standalone/common.cpp b/compiler-rt/lib/scudo/standalone/common.cpp index 06e930638f6f9..80134c39e757d 100644 --- a/compiler-rt/lib/scudo/standalone/common.cpp +++ b/compiler-rt/lib/scudo/standalone/common.cpp @@ -12,13 +12,21 @@ namespace scudo { -uptr PageSizeCached; +#if !defined(SCUDO_PAGE_SIZE) +uptr PageSizeCached = 0; +uptr PageSizeLogCached = 0; + +// Must be defined in platform specific code. uptr getPageSize(); +// This must be called in the init path or there could be a race if multiple +// threads try to set the cached values. uptr getPageSizeSlow() { PageSizeCached = getPageSize(); CHECK_NE(PageSizeCached, 0); + PageSizeLogCached = getLog2(PageSizeCached); return PageSizeCached; } +#endif } // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/common.h b/compiler-rt/lib/scudo/standalone/common.h index 151fbd317e74d..e5dfda2e9072a 100644 --- a/compiler-rt/lib/scudo/standalone/common.h +++ b/compiler-rt/lib/scudo/standalone/common.h @@ -133,18 +133,40 @@ inline void computePercentage(uptr Numerator, uptr Denominator, uptr *Integral, // Platform specific functions. +#if defined(SCUDO_PAGE_SIZE) + +inline constexpr uptr getPageSizeCached() { return SCUDO_PAGE_SIZE; } + +inline constexpr uptr getPageSizeSlow() { return getPageSizeCached(); } + +inline constexpr uptr getPageSizeLogCached() { + return static_cast(__builtin_ctzl(SCUDO_PAGE_SIZE)); +} + +#else + extern uptr PageSizeCached; +extern uptr PageSizeLogCached; + uptr getPageSizeSlow(); + inline uptr getPageSizeCached() { -#if SCUDO_ANDROID && defined(PAGE_SIZE) - // Most Android builds have a build-time constant page size. - return PAGE_SIZE; -#endif if (LIKELY(PageSizeCached)) return PageSizeCached; return getPageSizeSlow(); } +inline uptr getPageSizeLogCached() { + if (LIKELY(PageSizeLogCached)) + return PageSizeLogCached; + // PageSizeLogCached and PageSizeCached are both set in getPageSizeSlow() + getPageSizeSlow(); + DCHECK_NE(PageSizeLogCached, 0); + return PageSizeLogCached; +} + +#endif + // Returns 0 if the number of CPUs could not be determined. u32 getNumberOfCPUs(); diff --git a/compiler-rt/lib/scudo/standalone/linux.cpp b/compiler-rt/lib/scudo/standalone/linux.cpp index 2746951081098..6cc8e0c786e06 100644 --- a/compiler-rt/lib/scudo/standalone/linux.cpp +++ b/compiler-rt/lib/scudo/standalone/linux.cpp @@ -40,7 +40,10 @@ namespace scudo { +#if !defined(SCUDO_PAGE_SIZE) +// This function is only used when page size is not hard-coded. 
uptr getPageSize() { return static_cast(sysconf(_SC_PAGESIZE)); } +#endif void NORETURN die() { abort(); } diff --git a/compiler-rt/lib/scudo/standalone/platform.h b/compiler-rt/lib/scudo/standalone/platform.h index 5af1275e32d2b..3f017faaee78f 100644 --- a/compiler-rt/lib/scudo/standalone/platform.h +++ b/compiler-rt/lib/scudo/standalone/platform.h @@ -21,6 +21,11 @@ // See https://android.googlesource.com/platform/bionic/+/master/docs/defines.md #if defined(__BIONIC__) #define SCUDO_ANDROID 1 +// Transitive includes of unistd.h will get PAGE_SIZE if it is defined. +#include +#if defined(PAGE_SIZE) +#define SCUDO_PAGE_SIZE PAGE_SIZE +#endif #else #define SCUDO_ANDROID 0 #endif diff --git a/compiler-rt/lib/scudo/standalone/release.h b/compiler-rt/lib/scudo/standalone/release.h index 69f926e3f8680..51abdd82aa538 100644 --- a/compiler-rt/lib/scudo/standalone/release.h +++ b/compiler-rt/lib/scudo/standalone/release.h @@ -88,7 +88,7 @@ class FragmentationRecorder { void releasePageRangeToOS(uptr From, uptr To) { DCHECK_EQ((To - From) % getPageSizeCached(), 0U); - ReleasedPagesCount += (To - From) / getPageSizeCached(); + ReleasedPagesCount += (To - From) >> getPageSizeLogCached(); } private: @@ -348,7 +348,7 @@ class RegionPageMap { template class FreePagesRangeTracker { public: explicit FreePagesRangeTracker(ReleaseRecorderT &Recorder) - : Recorder(Recorder), PageSizeLog(getLog2(getPageSizeCached())) {} + : Recorder(Recorder) {} void processNextPage(bool Released) { if (Released) { @@ -372,6 +372,7 @@ template class FreePagesRangeTracker { private: void closeOpenedRange() { if (InRange) { + const uptr PageSizeLog = getPageSizeLogCached(); Recorder.releasePageRangeToOS((CurrentRangeStatePage << PageSizeLog), (CurrentPage << PageSizeLog)); InRange = false; @@ -379,7 +380,6 @@ template class FreePagesRangeTracker { } ReleaseRecorderT &Recorder; - const uptr PageSizeLog; bool InRange = false; uptr CurrentPage = 0; uptr CurrentRangeStatePage = 0; @@ -389,7 +389,7 @@ struct PageReleaseContext { PageReleaseContext(uptr BlockSize, uptr NumberOfRegions, uptr ReleaseSize, uptr ReleaseOffset = 0) : BlockSize(BlockSize), NumberOfRegions(NumberOfRegions) { - PageSize = getPageSizeCached(); + const uptr PageSize = getPageSizeCached(); if (BlockSize <= PageSize) { if (PageSize % BlockSize == 0) { // Same number of chunks per page, no cross overs. @@ -408,7 +408,7 @@ struct PageReleaseContext { SameBlockCountPerPage = false; } } else { - if (BlockSize % PageSize == 0) { + if ((BlockSize & (PageSize - 1)) == 0) { // One chunk covers multiple pages, no cross overs. FullPagesBlockCountMax = 1; SameBlockCountPerPage = true; @@ -427,8 +427,8 @@ struct PageReleaseContext { if (NumberOfRegions != 1) DCHECK_EQ(ReleaseOffset, 0U); - PagesCount = roundUp(ReleaseSize, PageSize) / PageSize; - PageSizeLog = getLog2(PageSize); + const uptr PageSizeLog = getPageSizeLogCached(); + PagesCount = roundUp(ReleaseSize, PageSize) >> PageSizeLog; ReleasePageOffset = ReleaseOffset >> PageSizeLog; } @@ -451,6 +451,7 @@ struct PageReleaseContext { // RegionSize, it's not necessary to be aligned with page size. 
bool markRangeAsAllCounted(uptr From, uptr To, uptr Base, const uptr RegionIndex, const uptr RegionSize) { + const uptr PageSize = getPageSizeCached(); DCHECK_LT(From, To); DCHECK_LE(To, Base + RegionSize); DCHECK_EQ(From % PageSize, 0U); @@ -544,6 +545,7 @@ struct PageReleaseContext { if (!ensurePageMapAllocated()) return false; + const uptr PageSize = getPageSizeCached(); if (MayContainLastBlockInRegion) { const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize; @@ -605,17 +607,19 @@ struct PageReleaseContext { return true; } - uptr getPageIndex(uptr P) { return (P >> PageSizeLog) - ReleasePageOffset; } - uptr getReleaseOffset() { return ReleasePageOffset << PageSizeLog; } + uptr getPageIndex(uptr P) { + return (P >> getPageSizeLogCached()) - ReleasePageOffset; + } + uptr getReleaseOffset() { + return ReleasePageOffset << getPageSizeLogCached(); + } uptr BlockSize; uptr NumberOfRegions; // For partial region marking, some pages in front are not needed to be // counted. uptr ReleasePageOffset; - uptr PageSize; uptr PagesCount; - uptr PageSizeLog; uptr FullPagesBlockCountMax; bool SameBlockCountPerPage; RegionPageMap PageMap; @@ -628,7 +632,7 @@ template NOINLINE void releaseFreeMemoryToOS(PageReleaseContext &Context, ReleaseRecorderT &Recorder, SkipRegionT SkipRegion) { - const uptr PageSize = Context.PageSize; + const uptr PageSize = getPageSizeCached(); const uptr BlockSize = Context.BlockSize; const uptr PagesCount = Context.PagesCount; const uptr NumberOfRegions = Context.NumberOfRegions; @@ -671,7 +675,7 @@ releaseFreeMemoryToOS(PageReleaseContext &Context, uptr PrevPageBoundary = 0; uptr CurrentBoundary = 0; if (ReleasePageOffset > 0) { - PrevPageBoundary = ReleasePageOffset * PageSize; + PrevPageBoundary = ReleasePageOffset << getPageSizeLogCached(); CurrentBoundary = roundUpSlow(PrevPageBoundary, BlockSize); } for (uptr J = 0; J < PagesCount; J++) { From 247d3ea843cb20d8d75ec781cd603c8ececf8934 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 5 Sep 2024 13:43:31 -0700 Subject: [PATCH 295/425] [SLP] Expand non-power-of-two bailout in TryToFindDuplicates This fixes a crash noticed when doing a downstream merge. The test case has been reduced, and is included in this commit. The existing bailout for non-power-of-two vectors in TryToFindDuplicates did not consider the case where the list being vectorized had no root node. This allowed reshuffled scalars to slip through to code which does not yet expect to handle it. This was an existing bug (likely introduced by my ed03070e), but made easier to hit by 63e8a1b1 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 3 +- .../SLPVectorizer/RISCV/vec3-base.ll | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 74bb529b2526e..a77d236413a96 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6989,7 +6989,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, ReuseShuffleIndices.clear(); } else { // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. 
- if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { + if ((UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) || + !llvm::has_single_bit(VL.size())) { LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " "for nodes with padding.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 4e8e019e155db..faffe16f8e9cd 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -762,6 +762,36 @@ define double @dot_product_fp64(ptr %a, ptr %b) { ret double %add.1 } +;; Covers a case where SLP would previous crash due to a +;; missing bailout in TryToFindDuplicates for the case +;; where a VL=3 list was vectorized directly (without +;; a root instruction such as a store or reduce). +define double @no_root_reshuffle(ptr %ptr) { +; CHECK-LABEL: @no_root_reshuffle( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[TMP0]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 16 +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]] +; CHECK-NEXT: [[MUL6:%.*]] = fmul fast double [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[MUL6]], [[MUL]] +; CHECK-NEXT: ret double [[ADD]] +; +entry: + %0 = load double, ptr %ptr, align 8 + %mul = fmul fast double %0, %0 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 8 + %1 = load double, ptr %arrayidx2, align 8 + %arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 16 + %2 = load double, ptr %arrayidx3, align 8 + %3 = fmul fast double %2, %2 + %mul6 = fmul fast double %3, %1 + %add = fadd fast double %mul6, %mul + ret double %add +} declare float @llvm.fmuladd.f32(float, float, float) From 24684bb4a9791145a36a97477eb1fd525a122d8e Mon Sep 17 00:00:00 2001 From: Evgenii Stepanov Date: Thu, 5 Sep 2024 14:09:33 -0700 Subject: [PATCH 296/425] [sanitizer] Delay sanitizer args parsing (#107280) Delay sanitizer arg parsing until after -Xclang flags are forwarded to the clang command line. 
This allows the check in hasTargetFeatureMTE to pick up manually specified target feature, and enables the following: -march=armv8-a -Xclang -target-feature -Xclang +mte -fsanitize=memtag-stack --- clang/lib/Driver/ToolChains/Clang.cpp | 6 ++++-- clang/test/Driver/fsanitize.c | 4 +++- clang/test/Driver/fuchsia.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 90a747ca58986..3fe4ce5d893b8 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6786,8 +6786,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("--offload-new-driver"); } - SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType); - const XRayArgs &XRay = TC.getXRayArgs(); XRay.addArgs(TC, Args, CmdArgs, InputType); @@ -7677,6 +7675,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } } + // This needs to run after -Xclang argument forwarding to pick up the target + // features enabled through -Xclang -target-feature flags. + SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType); + // With -save-temps, we want to save the unoptimized bitcode output from the // CompileJobAction, use -disable-llvm-passes to get pristine IR generated // by the frontend. diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index f86c978f221cd..6ecf0b57bee5c 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -197,6 +197,8 @@ // CHECK-SANMT-MT: "-target-feature" "+mte" // CHECK-SANMT-MT-SAME: "-fsanitize=memtag-stack,memtag-heap,memtag-globals" +// RUN: not %clang --target=aarch64-linux -fsanitize=memtag -Xclang -target-feature -Xclang +mte %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANMT-MT + // RUN: not %clang --target=aarch64-linux -fsanitize=memtag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANMT-NOMT-0 // CHECK-SANMT-NOMT-0: '-fsanitize=memtag-stack' requires hardware support (+memtag) @@ -726,8 +728,8 @@ // NO-SP-NOT: stack-protector // NO-SP: "-fsanitize=safe-stack" // SP-ASAN: error: invalid argument '-fsanitize=safe-stack' not allowed with '-fsanitize=address' -// SP: "-fsanitize=safe-stack" // SP: -stack-protector +// SP: "-fsanitize=safe-stack" // NO-SP-NOT: stack-protector // RUN: %clang --target=powerpc64-unknown-linux-gnu -fsanitize=memory %s -### 2>&1 | FileCheck %s -check-prefix=CHECK-SANM diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c index c67f7f8c005b3..83dee16981690 100644 --- a/clang/test/Driver/fuchsia.c +++ b/clang/test/Driver/fuchsia.c @@ -30,10 +30,10 @@ // CHECK: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK: "-isysroot" "[[SYSROOT:[^"]+]]" // CHECK: "-internal-externc-isystem" "[[SYSROOT]]{{/|\\\\}}include" +// CHECK: "-stack-protector" "2" // CHECK-AARCH64: "-fsanitize=shadow-call-stack" // CHECK-RISCV64: "-fsanitize=shadow-call-stack" // CHECK-X86_64: "-fsanitize=safe-stack" -// CHECK: "-stack-protector" "2" // CHECK-AARCH64: "-target-feature" "+outline-atomics" // CHECK-NOT: "-fcommon" // CHECK: {{.*}}ld.lld{{.*}}" "-z" "max-page-size=4096" "-z" "now" "-z" "start-stop-visibility=hidden" "-z" "rodynamic" "-z" "separate-loadable-segments" "-z" "rel" "--pack-dyn-relocs=relr" From 3836d4acccbe87216133d08d75df509e95c291f0 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 14:27:31 -0700 Subject: [PATCH 297/425] [lldb] Convert ConnectionGenericFileWindows.cpp to new Status API (NFC) --- 
lldb/source/Host/windows/ConnectionGenericFileWindows.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp b/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp index fd78a625d9cb9..170444f56b310 100644 --- a/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp +++ b/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp @@ -236,7 +236,7 @@ size_t ConnectionGenericFile::Read(void *dst, size_t dst_len, finish: status = return_info.GetStatus(); if (error_ptr) - *error_ptr = return_info.GetError(); + *error_ptr = Status::FromError(return_info.GetError()); // kBytesAvailableEvent is a manual reset event. Make sure it gets reset // here so that any subsequent operations don't immediately see bytes @@ -290,7 +290,7 @@ size_t ConnectionGenericFile::Write(const void *src, size_t src_len, finish: status = return_info.GetStatus(); if (error_ptr) - *error_ptr = return_info.GetError(); + *error_ptr = Status::FromError(return_info.GetError()); IncrementFilePointer(return_info.GetBytes()); Log *log = GetLog(LLDBLog::Connection); From f00c946c2da0caf6da4a49e87ac905a8b1d2e8b6 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 14:28:28 -0700 Subject: [PATCH 298/425] [lldb] Convert MainLoopWindows.cpp to new Status API (NFC) --- lldb/source/Host/windows/MainLoopWindows.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Host/windows/MainLoopWindows.cpp b/lldb/source/Host/windows/MainLoopWindows.cpp index 551e73e6904ae..88d929535ab6c 100644 --- a/lldb/source/Host/windows/MainLoopWindows.cpp +++ b/lldb/source/Host/windows/MainLoopWindows.cpp @@ -121,7 +121,7 @@ Status MainLoopWindows::Run() { llvm::Expected signaled_event = Poll(); if (!signaled_event) - return Status(signaled_event.takeError()); + return Status::FromError(signaled_event.takeError()); if (*signaled_event < m_read_fds.size()) { auto &KV = *std::next(m_read_fds.begin(), *signaled_event); From 64498c54831bed9cf069e0923b9b73678c6451d8 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Thu, 5 Sep 2024 14:49:03 -0700 Subject: [PATCH 299/425] [LTO][ELF][lld] Use unique string saver in ELF bitcode symbol parsing (#106670) lld ELF [BitcodeFile](https://github.com/llvm/llvm-project/blob/a527248a3c2d638b0c92a06992f3f1c1f80842ad/lld/ELF/InputFiles.h#L328) uses [string saver](https://github.com/llvm/llvm-project/blob/a527248a3c2d638b0c92a06992f3f1c1f80842ad/lld/include/lld/Common/CommonLinkerContext.h#L57) to keep copies of bitcode symbols. Symbol duplication is very common when compiling application binaries. This change proposes to introduce a UniqueStringSaver in lld context and use it for bitcode symbol parsing. The implementation covers ELF only. Similar opportunities should exist on other (COFF, MachO, wasm) formats. For an internal production binary where lto indexing takes ~10GiB originally, this changes optimizes away ~800MiB (~7.8%), measured by https://github.com/google/pprof. Flame graph breaks down memory by usage call stacks and agrees with this measurement. --- lld/ELF/InputFiles.cpp | 8 ++++++-- lld/include/lld/Common/CommonLinkerContext.h | 8 +++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 7adc35f20984a..1570adf137093 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1744,8 +1744,10 @@ createBitcodeSymbol(Symbol *&sym, const std::vector &keptComdats, uint8_t type = objSym.isTLS() ? 
STT_TLS : STT_NOTYPE; uint8_t visibility = mapVisibility(objSym.getVisibility()); + // Symbols can be duplicated in bitcode files because of '#include' and + // linkonce_odr. Use unique_saver to save symbol names for de-duplication. if (!sym) - sym = symtab.insert(saver().save(objSym.getName())); + sym = symtab.insert(unique_saver().save(objSym.getName())); int c = objSym.getComdatIndex(); if (objSym.isUndefined() || (c != -1 && !keptComdats[c])) { @@ -1797,7 +1799,9 @@ void BitcodeFile::parseLazy() { symbols = std::make_unique(numSymbols); for (auto [i, irSym] : llvm::enumerate(obj->symbols())) if (!irSym.isUndefined()) { - auto *sym = symtab.insert(saver().save(irSym.getName())); + // Symbols can be duplicated in bitcode files because of '#include' and + // linkonce_odr. Use unique_saver to save symbol names for de-duplication. + auto *sym = symtab.insert(unique_saver().save(irSym.getName())); sym->resolve(LazySymbol{*this}); symbols[i] = sym; } diff --git a/lld/include/lld/Common/CommonLinkerContext.h b/lld/include/lld/Common/CommonLinkerContext.h index 0627bbdc8bd87..9970dfcb713f8 100644 --- a/lld/include/lld/Common/CommonLinkerContext.h +++ b/lld/include/lld/Common/CommonLinkerContext.h @@ -38,6 +38,7 @@ class CommonLinkerContext { llvm::BumpPtrAllocator bAlloc; llvm::StringSaver saver{bAlloc}; + llvm::UniqueStringSaver unique_saver{bAlloc}; llvm::DenseMap instances; ErrorHandler e; @@ -54,8 +55,13 @@ template T &context() { bool hasContext(); -inline llvm::StringSaver &saver() { return context().saver; } inline llvm::BumpPtrAllocator &bAlloc() { return context().bAlloc; } +inline llvm::StringSaver &saver() { return context().saver; } +inline llvm::UniqueStringSaver &unique_saver() { + // FIXME: Look into other places where duplications are common in saved + // strings and unique saver make sense. + return context().unique_saver; +} } // namespace lld #endif From 50be455ab88b17872cd620698156b4058dc92f58 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 5 Sep 2024 14:52:30 -0700 Subject: [PATCH 300/425] [TableGen] Add check for number of intrinsic return values (#107326) Fail if we see an intrinsic that returns more than the supported number of return values. Intrinsics can return only up to a certain number of values, as defined by the `IIT_RetNumbers` list in `Intrinsics.td`. Currently, if we define an intrinsic that exceeds the limit, llvm-tblgen crashes. Instead, read this limit and fail if it's exceeded with a proper error message.
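Concretely, with the current `IIT_RetNumbers` list the limit works out to 9 return values (the list size minus one). The updated intrinsic-struct.td test below exercises both sides of the check: a 9-result intrinsic is still accepted, while

  def int_returns_10_results : Intrinsic< !listsplat(llvm_anyint_ty, 10), [], [], "llvm.returns.10.results">;

is now rejected with "error: intrinsics can only return upto 9 values, 'int_returns_10_results' returns 10 values" instead of crashing llvm-tblgen.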
--- llvm/test/TableGen/intrinsic-struct.td | 25 +++++++++++++------ .../TableGen/Basic/CodeGenIntrinsics.cpp | 17 ++++++++++++- llvm/utils/TableGen/Basic/CodeGenIntrinsics.h | 3 +++ 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/llvm/test/TableGen/intrinsic-struct.td b/llvm/test/TableGen/intrinsic-struct.td index f23a7a7643af2..467fd9057c183 100644 --- a/llvm/test/TableGen/intrinsic-struct.td +++ b/llvm/test/TableGen/intrinsic-struct.td @@ -1,11 +1,22 @@ -// RUN: llvm-tblgen -gen-intrinsic-enums -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS | FileCheck %s +// RUN: llvm-tblgen -gen-intrinsic-enums -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS | FileCheck %s --check-prefix=CHECK-ENUM +// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS > /dev/null 2>&1 +// RUN: not llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS -DENABLE_ERROR 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + // XFAIL: vg_leak include "llvm/IR/Intrinsics.td" -// Make sure we can return up to 8 values -// CHECK: returns_8_results = {{[0-9]+}}, // llvm.returns.8.results -def int_returns_8_results : Intrinsic< - [llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, - llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty], - [], [], "llvm.returns.8.results">; +// Make sure we can return up to 9 values. +// CHECK-ENUM: returns_9_results = {{[0-9]+}}, // llvm.returns.9.results +def int_returns_9_results : Intrinsic< + !listsplat(llvm_anyint_ty, 9), + [], [], "llvm.returns.9.results">; + +#ifdef ENABLE_ERROR +// CHECK-ERROR: error: intrinsics can only return upto 9 values, 'int_returns_10_results' returns 10 values +// CHECK-ERROR-NEXT: def int_returns_10_results : Intrinsic< +def int_returns_10_results : Intrinsic< + !listsplat(llvm_anyint_ty, 10), + [], [], "llvm.returns.10.results">; + +#endif diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index a30a7577408f8..23c64912c780f 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -29,6 +29,15 @@ CodeGenIntrinsicContext::CodeGenIntrinsicContext(const RecordKeeper &RC) { for (const Record *Rec : RC.getAllDerivedDefinitions("IntrinsicProperty")) if (Rec->getValueAsBit("IsDefault")) DefaultProperties.push_back(Rec); + + // The maximum number of values that an intrinsic can return is the size of + // of `IIT_RetNumbers` list - 1 (since we index into this list using the + // number of return values as the index). 
+ const auto *IIT_RetNumbers = + dyn_cast_or_null(RC.getGlobal("IIT_RetNumbers")); + if (!IIT_RetNumbers) + PrintFatalError("unable to find 'IIT_RetNumbers' list"); + MaxNumReturn = IIT_RetNumbers->size() - 1; } CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { @@ -106,6 +115,13 @@ CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, TargetPrefix + ".'!"); } + unsigned NumRet = R->getValueAsListInit("RetTypes")->size(); + if (NumRet > Ctx.MaxNumReturn) + PrintFatalError(DefLoc, "intrinsics can only return upto " + + Twine(Ctx.MaxNumReturn) + " values, '" + + DefName + "' returns " + Twine(NumRet) + + " values"); + const Record *TypeInfo = R->getValueAsDef("TypeInfo"); if (!TypeInfo->isSubClassOf("TypeInfoGen")) PrintFatalError(DefLoc, "TypeInfo field in " + DefName + @@ -116,7 +132,6 @@ CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, // Types field is a concatenation of Return types followed by Param types. unsigned Idx = 0; - unsigned NumRet = R->getValueAsListInit("RetTypes")->size(); for (; Idx < NumRet; ++Idx) IS.RetTys.push_back(TypeList->getElementAsRecord(Idx)); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 51c2359155380..83282d18789b2 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -30,6 +30,9 @@ class RecordKeeper; struct CodeGenIntrinsicContext { explicit CodeGenIntrinsicContext(const RecordKeeper &RC); std::vector DefaultProperties; + + // Maximum number of values an intrinsic can return. + unsigned MaxNumReturn; }; struct CodeGenIntrinsic { From 3380dae2f0d6b8035744da573c4508b98c80045c Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Thu, 5 Sep 2024 14:55:05 -0700 Subject: [PATCH 301/425] [lld][InstrProf] Refactor BPSectionOrderer.cpp (#107347) Refactor some code in `BPSectionOrderer.cpp` in preparation for https://github.com/llvm/llvm-project/pull/107348. * Rename `constructNodesForCompression()` -> `getUnsForCompression()` and return a `SmallVector` directly rather than populating a vector alias * Pass `duplicateSectionIdxs` as a pointer to make it possible to skip finding (nearly) duplicate sections * Combine `duplicate{Function,Data}SectionIdxs` into one variable * Compute all `BPFunctionNode` vectors at the end (like `nodesForStartup`) There should be no functional change. --- lld/MachO/BPSectionOrderer.cpp | 124 ++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp index 568843d72bbb5..97458c96fd80d 100644 --- a/lld/MachO/BPSectionOrderer.cpp +++ b/lld/MachO/BPSectionOrderer.cpp @@ -21,6 +21,8 @@ using namespace llvm; using namespace lld::macho; +using UtilityNodes = SmallVector; + /// Symbols can be appended with "(.__uniq.xxxx)?.llvm.yyyy" where "xxxx" and /// "yyyy" are numbers that could change between builds. We need to use the root /// symbol name before this suffix so these symbols can be matched with profiles @@ -60,12 +62,15 @@ getRelocHash(const Reloc &reloc, return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend); } -static void constructNodesForCompression( - const SmallVector §ions, +/// Given \p sectionIdxs, a list of section indexes, return a list of utility +/// nodes for each section index. If \p duplicateSectionIdx is provided, +/// populate it with nearly identical sections. Increment \p maxUN to be the +/// largest utility node we have used so far. 
+static SmallVector> getUnsForCompression( + ArrayRef sections, const DenseMap §ionToIdx, - const SmallVector §ionIdxs, - std::vector &nodes, - DenseMap> &duplicateSectionIdxs, + ArrayRef sectionIdxs, + DenseMap> *duplicateSectionIdxs, BPFunctionNode::UtilityNodeT &maxUN) { TimeTraceScope timeScope("Build nodes for compression"); @@ -103,49 +108,52 @@ static void constructNodesForCompression( for (auto hash : hashes) ++hashFrequency[hash]; - // Merge section that are nearly identical - SmallVector>> newSectionHashes; - DenseMap wholeHashToSectionIdx; - for (auto &[sectionIdx, hashes] : sectionHashes) { - uint64_t wholeHash = 0; - for (auto hash : hashes) - if (hashFrequency[hash] > 5) - wholeHash ^= hash; - auto [it, wasInserted] = - wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx)); - if (wasInserted) { - newSectionHashes.emplace_back(sectionIdx, hashes); - } else { - duplicateSectionIdxs[it->getSecond()].push_back(sectionIdx); + if (duplicateSectionIdxs) { + // Merge section that are nearly identical + SmallVector>> newSectionHashes; + DenseMap wholeHashToSectionIdx; + for (auto &[sectionIdx, hashes] : sectionHashes) { + uint64_t wholeHash = 0; + for (auto hash : hashes) + if (hashFrequency[hash] > 5) + wholeHash ^= hash; + auto [it, wasInserted] = + wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx)); + if (wasInserted) { + newSectionHashes.emplace_back(sectionIdx, hashes); + } else { + (*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx); + } } - } - sectionHashes = newSectionHashes; + sectionHashes = newSectionHashes; - // Recompute hash frequencies - hashFrequency.clear(); - for (auto &[sectionIdx, hashes] : sectionHashes) - for (auto hash : hashes) - ++hashFrequency[hash]; + // Recompute hash frequencies + hashFrequency.clear(); + for (auto &[sectionIdx, hashes] : sectionHashes) + for (auto hash : hashes) + ++hashFrequency[hash]; + } // Filter rare and common hashes and assign each a unique utility node that // doesn't conflict with the trace utility nodes DenseMap hashToUN; for (auto &[hash, frequency] : hashFrequency) { - if (frequency <= 1 || frequency * 2 > wholeHashToSectionIdx.size()) + if (frequency <= 1 || frequency * 2 > sectionHashes.size()) continue; hashToUN[hash] = ++maxUN; } - std::vector uns; + SmallVector> sectionUns; for (auto &[sectionIdx, hashes] : sectionHashes) { + UtilityNodes uns; for (auto &hash : hashes) { auto it = hashToUN.find(hash); if (it != hashToUN.end()) uns.push_back(it->second); } - nodes.emplace_back(sectionIdx, uns); - uns.clear(); + sectionUns.emplace_back(sectionIdx, uns); } + return sectionUns; } DenseMap lld::macho::runBalancedPartitioning( @@ -185,10 +193,10 @@ DenseMap lld::macho::runBalancedPartitioning( sectionIdxs.end()); } - std::vector nodesForStartup; BPFunctionNode::UtilityNodeT maxUN = 0; - DenseMap> - startupSectionIdxUNs; + DenseMap startupSectionIdxUNs; + // Used to define the initial order for startup functions. + DenseMap sectionIdxToTimestamp; std::unique_ptr reader; if (!profilePath.empty()) { auto fs = vfs::getRealFileSystem(); @@ -202,8 +210,6 @@ DenseMap lld::macho::runBalancedPartitioning( } auto &traces = reader->getTemporalProfTraces(); - // Used to define the initial order for startup functions. 
- DenseMap sectionIdxToTimestamp; DenseMap sectionIdxToFirstUN; for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) { uint64_t currentSize = 0, cutoffSize = 1; @@ -245,15 +251,6 @@ DenseMap lld::macho::runBalancedPartitioning( ++maxUN; sectionIdxToFirstUN.clear(); } - - // These uns should already be sorted without duplicates. - for (auto &[sectionIdx, uns] : startupSectionIdxUNs) - nodesForStartup.emplace_back(sectionIdx, uns); - - llvm::sort(nodesForStartup, [§ionIdxToTimestamp](auto &L, auto &R) { - return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) < - std::make_pair(sectionIdxToTimestamp[R.Id], R.Id); - }); } SmallVector sectionIdxsForFunctionCompression, @@ -271,21 +268,32 @@ DenseMap lld::macho::runBalancedPartitioning( } } - std::vector nodesForFunctionCompression, - nodesForDataCompression; // Map a section index (to be ordered for compression) to a list of duplicate // section indices (not ordered for compression). - DenseMap> duplicateFunctionSectionIdxs, - duplicateDataSectionIdxs; - constructNodesForCompression( + DenseMap> duplicateSectionIdxs; + auto unsForFunctionCompression = getUnsForCompression( sections, sectionToIdx, sectionIdxsForFunctionCompression, - nodesForFunctionCompression, duplicateFunctionSectionIdxs, maxUN); - constructNodesForCompression( + &duplicateSectionIdxs, maxUN); + auto unsForDataCompression = getUnsForCompression( sections, sectionToIdx, sectionIdxsForDataCompression, - nodesForDataCompression, duplicateDataSectionIdxs, maxUN); + &duplicateSectionIdxs, maxUN); - // Sort nodes by their Id (which is the section index) because the input - // linker order tends to be not bad + std::vector nodesForStartup, nodesForFunctionCompression, + nodesForDataCompression; + for (auto &[sectionIdx, uns] : startupSectionIdxUNs) + nodesForStartup.emplace_back(sectionIdx, uns); + for (auto &[sectionIdx, uns] : unsForFunctionCompression) + nodesForFunctionCompression.emplace_back(sectionIdx, uns); + for (auto &[sectionIdx, uns] : unsForDataCompression) + nodesForDataCompression.emplace_back(sectionIdx, uns); + + // Use the first timestamp to define the initial order for startup nodes. + llvm::sort(nodesForStartup, [§ionIdxToTimestamp](auto &L, auto &R) { + return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) < + std::make_pair(sectionIdxToTimestamp[R.Id], R.Id); + }); + // Sort compression nodes by their Id (which is the section index) because the + // input linker order tends to be not bad. 
llvm::sort(nodesForFunctionCompression, [](auto &L, auto &R) { return L.Id < R.Id; }); llvm::sort(nodesForDataCompression, @@ -318,8 +326,8 @@ DenseMap lld::macho::runBalancedPartitioning( if (orderedSections.insert(isec)) ++numCodeCompressionSections; - auto It = duplicateFunctionSectionIdxs.find(node.Id); - if (It == duplicateFunctionSectionIdxs.end()) + auto It = duplicateSectionIdxs.find(node.Id); + if (It == duplicateSectionIdxs.end()) continue; for (auto dupSecIdx : It->getSecond()) { const auto *dupIsec = sections[dupSecIdx]; @@ -332,8 +340,8 @@ DenseMap lld::macho::runBalancedPartitioning( const auto *isec = sections[node.Id]; if (orderedSections.insert(isec)) ++numDataCompressionSections; - auto It = duplicateDataSectionIdxs.find(node.Id); - if (It == duplicateDataSectionIdxs.end()) + auto It = duplicateSectionIdxs.find(node.Id); + if (It == duplicateSectionIdxs.end()) continue; for (auto dupSecIdx : It->getSecond()) { const auto *dupIsec = sections[dupSecIdx]; From 7ea9f0d85fc3dc80b45e6ba7087c41c6f2481f07 Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Thu, 5 Sep 2024 14:51:14 -0700 Subject: [PATCH 302/425] [SandboxVec][NFC] Remove unused header files --- .../Vectorize/SandboxVectorizer/SandboxVectorizer.h | 7 ++----- .../Vectorize/SandboxVectorizer/SandboxVectorizer.cpp | 3 ++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h index 0d4dbae44521e..dd9f02d327264 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h @@ -8,15 +8,12 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/PassManager.h" -#include "llvm/SandboxIR/SandboxIR.h" namespace llvm { +class TargetTransformInfo; + class SandboxVectorizerPass : public PassInfoMixin { TargetTransformInfo *TTI = nullptr; diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index 072a6606694a0..ec4bfbb56ecb4 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -7,7 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/SandboxIR/SandboxIR.h" using namespace llvm; From 73514f6831cfcea49f33fb9e31db0141b05532f2 Mon Sep 17 00:00:00 2001 From: wldfngrs Date: Thu, 5 Sep 2024 23:04:35 +0100 Subject: [PATCH 303/425] [libc] Add proxy header for __sighandler_t type (#107354) Added proxy headers for __sighandler_t type, modified the corresponding CMakeLists.txt files and test files --- libc/hdr/types/CMakeLists.txt | 10 ++++++++++ libc/hdr/types/sighandler_t.h | 24 ++++++++++++++++++++++++ libc/src/signal/linux/signal.cpp | 1 + libc/src/signal/signal.h | 4 +--- libc/test/src/signal/CMakeLists.txt | 2 +- libc/test/src/signal/signal_test.cpp | 4 ++-- 6 files changed, 
39 insertions(+), 6 deletions(-) create mode 100644 libc/hdr/types/sighandler_t.h diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index f41576c07d99b..ea7bbccffbb81 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -171,3 +171,13 @@ add_proxy_header_library( libc.include.llvm-libc-types.locale_t libc.include.locale ) + +add_proxy_header_library( + __sighandler_t + HDRS + sighandler_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.__sighandler_t + libc.include.signal +) + diff --git a/libc/hdr/types/sighandler_t.h b/libc/hdr/types/sighandler_t.h new file mode 100644 index 0000000000000..b18f8e856c5b6 --- /dev/null +++ b/libc/hdr/types/sighandler_t.h @@ -0,0 +1,24 @@ +//===-- Definition of macros from __sighandler_t.h ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_SIGHANDLER_T_H +#define LLVM_LIBC_HDR_SIGHANDLER_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/__sighandler_t.h" + +using sighandler_t = __sighandler_t; + +#else // overlay mode + +#include + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_SIGHANDLER_T_H diff --git a/libc/src/signal/linux/signal.cpp b/libc/src/signal/linux/signal.cpp index 241258faf2e81..1da0ef8c97a20 100644 --- a/libc/src/signal/linux/signal.cpp +++ b/libc/src/signal/linux/signal.cpp @@ -8,6 +8,7 @@ #include "src/signal/signal.h" #include "hdr/signal_macros.h" +#include "hdr/types/sighandler_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/signal/sigaction.h" diff --git a/libc/src/signal/signal.h b/libc/src/signal/signal.h index 2037305f8c11e..06e77e11bf0bd 100644 --- a/libc/src/signal/signal.h +++ b/libc/src/signal/signal.h @@ -9,13 +9,11 @@ #ifndef LLVM_LIBC_SRC_SIGNAL_SIGNAL_H #define LLVM_LIBC_SRC_SIGNAL_SIGNAL_H +#include "hdr/types/sighandler_t.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { -using sighandler_t = __sighandler_t; - sighandler_t signal(int signum, sighandler_t handler); } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt index edbd5c19edab3..f7923204eaf49 100644 --- a/libc/test/src/signal/CMakeLists.txt +++ b/libc/test/src/signal/CMakeLists.txt @@ -74,7 +74,7 @@ add_libc_unittest( SRCS signal_test.cpp DEPENDS - libc.include.signal + libc.hdr.types.sighandler_t libc.src.errno.errno libc.src.signal.raise libc.src.signal.signal diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp index 70e95a8c159a8..4b57311eee2d8 100644 --- a/libc/test/src/signal/signal_test.cpp +++ b/libc/test/src/signal/signal_test.cpp @@ -13,14 +13,14 @@ #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include +#include "hdr/types/sighandler_t.h" using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; TEST(LlvmLibcSignal, Invalid) { LIBC_NAMESPACE::libc_errno = 0; - LIBC_NAMESPACE::sighandler_t valid = +[](int) {}; + sighandler_t valid = +[](int) {}; EXPECT_THAT((void *)LIBC_NAMESPACE::signal(0, valid), Fails(EINVAL, (void *)SIG_ERR)); EXPECT_THAT((void *)LIBC_NAMESPACE::signal(65, valid), From 
e44a67543c0b6a3a2307362f5bbcf54cd6de6a8e Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 5 Sep 2024 15:05:15 -0700 Subject: [PATCH 304/425] AMDGPU: Add a few unsupported checks for llvm.fptrunc.round intrinsic (#107330) A check here can be removed when we implement support for the corresponding types/mode. --- .../CodeGen/AMDGPU/llvm.fptrunc.round.err.ll | 55 +++++++++++++++++-- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll index f1d5b07e832c4..291fe00a6177b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll @@ -1,11 +1,54 @@ -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL -; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL +; RUN: split-file %s %t -define amdgpu_gs void @test_fptrunc_round_f64(double %a, ptr addrspace(1) %out) { -; FAIL: LLVM ERROR: Cannot select +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F64-FAIL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F64-FAIL %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-FAIL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-FAIL %s + +; TODO: check for GISEL when bfloat is supported. 
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/bf16-f32-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=BF16-F32-FAIL %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/bf16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=BF16-F64-FAIL %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=TONEARESTAWAY-FAIL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=TONEARESTAWAY-FAIL %s + +;--- f16-f64-err.ll +define amdgpu_gs void @test_fptrunc_round_f16_f64(double %a, ptr addrspace(1) %out) { +; F16-F64-FAIL: LLVM ERROR: Cannot select %res = call half @llvm.fptrunc.round.f16.f64(double %a, metadata !"round.upward") - store half %res, ptr addrspace(1) %out, align 4 + store half %res, ptr addrspace(1) %out, align 2 ret void } -declare half @llvm.fptrunc.round.f16.f64(double, metadata) +;--- f32-f64-err.ll +define amdgpu_gs void @test_fptrunc_round_f32_f64(double %a, ptr addrspace(1) %out) { +; F32-F64-FAIL: LLVM ERROR: Cannot select + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.downward") + store float %res, ptr addrspace(1) %out, align 4 + ret void +} + +;--- bf16-f32-err.ll +define amdgpu_gs void @test_fptrunc_round_bf16_f32(float %a, ptr addrspace(1) %out) { +; BF16-F32-FAIL: LLVM ERROR: Cannot select + %res = call bfloat @llvm.fptrunc.round.bf16.f32(float %a, metadata !"round.towardzero") + store bfloat %res, ptr addrspace(1) %out, align 2 + ret void +} + +;--- bf16-f64-err.ll +define amdgpu_gs void @test_fptrunc_round_bf16_f64(double %a, ptr addrspace(1) %out) { +; BF16-F64-FAIL: LLVM ERROR: Cannot select + %res = call bfloat @llvm.fptrunc.round.bf16.f32(double %a, metadata !"round.tonearest") + store bfloat %res, ptr addrspace(1) %out, align 2 + ret void +} + +;--- f16-f32-tonearestaway-err.ll +define amdgpu_gs void @test_fptrunc_round_f16_f32_tonearestaway(float %a, ptr addrspace(1) %out) { +; TONEARESTAWAY-FAIL: LLVM ERROR: Cannot select + %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.tonearestaway") + store half %res, ptr addrspace(1) %out, align 2 + ret void +} From 1e98aa4730b1b3b93205af74be26e04d5f876d10 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 15:09:05 -0700 Subject: [PATCH 305/425] [lldb] Convert ConnectionGenericFileWindows.cpp to new Status API (NFC) --- lldb/source/Host/windows/ConnectionGenericFileWindows.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp b/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp index 170444f56b310..6a6153d6e34a0 100644 --- a/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp +++ b/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp @@ -236,7 +236,7 @@ size_t ConnectionGenericFile::Read(void *dst, size_t dst_len, finish: status = return_info.GetStatus(); if (error_ptr) - *error_ptr = Status::FromError(return_info.GetError()); + *error_ptr = return_info.GetError().Clone(); // kBytesAvailableEvent is a manual reset event. 
Make sure it gets reset // here so that any subsequent operations don't immediately see bytes @@ -290,7 +290,7 @@ size_t ConnectionGenericFile::Write(const void *src, size_t src_len, finish: status = return_info.GetStatus(); if (error_ptr) - *error_ptr = Status::FromError(return_info.GetError()); + *error_ptr = return_info.GetError().Clone(); IncrementFilePointer(return_info.GetBytes()); Log *log = GetLog(LLDBLog::Connection); From bd840a40042c2c67f56079493d0bcdbfc70325ba Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 5 Sep 2024 15:14:31 -0700 Subject: [PATCH 306/425] [AMDGPU] Add target intrinsic for s_prefetch_data (#107133) --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 3 + .../CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 26 +++- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++ .../AMDGPU/AMDGPUInstructionSelector.cpp | 7 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 15 ++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 13 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SMInstructions.td | 22 +++ .../AMDGPU/llvm.amdgcn.s.prefetch.data.ll | 136 ++++++++++++++++++ 10 files changed, 230 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index ab29ef38f7792..5060647d35764 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -448,6 +448,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 02d8726baa421..da7a1a55da531 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19608,6 +19608,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))}); } + case AMDGPU::BI__builtin_amdgcn_s_prefetch_data: + return emitBuiltinWithOneOverloadedType<2>( + *this, E, Intrinsic::amdgcn_s_prefetch_data); default: return nullptr; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index d9ec258e644c9..34ee44afe0f10 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -256,4 +256,28 @@ void test_s_ttracedata_imm() __builtin_amdgcn_s_ttracedata_imm(1); } - +// CHECK-LABEL: @test_s_prefetch_data( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5) +// CHECK-NEXT: [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store ptr [[FP:%.*]], ptr addrspace(5) [[FP_ADDR]], align 8 +// 
CHECK-NEXT: store ptr addrspace(1) [[GP:%.*]], ptr addrspace(5) [[GP_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[CP_ADDR]], align 8 +// CHECK-NEXT: store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p0(ptr [[TMP0]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GP_ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(5) [[CP_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) [[TMP3]], i32 31) +// CHECK-NEXT: ret void +// +void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned int len) +{ + __builtin_amdgcn_s_prefetch_data(fp, 0); + __builtin_amdgcn_s_prefetch_data(gp, len); + __builtin_amdgcn_s_prefetch_data(cp, 31); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index dc13a35c66f9a..a5259ba9eec36 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2689,6 +2689,14 @@ def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic; def int_amdgcn_wave_id : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef, IntrNoMem, IntrSpeculatable]>; +def int_amdgcn_s_prefetch_data : + Intrinsic<[], + [llvm_anyptr_ty, // Pointer to a constant/global memory + llvm_i32_ty], // Length to prefetch 0-31 (1-32 chunks, units of 128 bytes) + [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand] + >; + //===----------------------------------------------------------------------===// // Deep learning intrinsics.
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3fcb364fc2c53..9bebd418bb426 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5541,7 +5541,12 @@ void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { - MIB.addImm(MI.getOperand(OpIdx).getImm()); + const MachineOperand &Op = MI.getOperand(OpIdx); + int64_t Imm; + if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm))) + MIB.addImm(Imm); + else + MIB.addImm(Op.getImm()); } void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 4737a322c255f..a2e6842b760f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3290,6 +3290,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 2); return; } + case Intrinsic::amdgcn_s_prefetch_data: { + Register PtrReg = MI.getOperand(1).getReg(); + unsigned AS = MRI.getType(PtrReg).getAddressSpace(); + if (AMDGPU::isFlatGlobalAddrSpace(AS)) { + constrainOpWithReadfirstlane(B, MI, 1); + constrainOpWithReadfirstlane(B, MI, 2); + } else + MI.eraseFromParent(); + return; + } default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -5151,6 +5161,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_pops_exiting_wave_id: return getDefaultMappingSOP(MI); + case Intrinsic::amdgcn_s_prefetch_data: { + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } default: return getInvalidInstructionMapping(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 81b52935ddf39..accc3084217f2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1430,6 +1430,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_s_prefetch_data: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); + Info.ptrVal = CI.getArgOperand(0); + Info.flags |= MachineMemOperand::MOLoad; + return true; + } default: return false; } @@ -9921,6 +9928,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } + case Intrinsic::amdgcn_s_prefetch_data: { + // For non-global address space preserve the chain and remove the call. 
+ if (!AMDGPU::isFlatGlobalAddrSpace(cast(Op)->getAddressSpace())) + return Op.getOperand(0); + return Op; + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 844f62abc2671..90e11df500bc9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6231,7 +6231,7 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, SBase->setReg(SGPR); } MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset); - if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { + if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) { Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); SOff->setReg(SGPR); } diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 9fc570bb85f24..e7db4f49d9e54 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -1152,6 +1152,28 @@ multiclass SMPrefetchPat { defm : SMPrefetchPat<"INST", i32imm_zero>; defm : SMPrefetchPat<"DATA", i32imm_one>; +let SubtargetPredicate = isGFX12Plus in { + def : GCNPat < + (int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), (i32 SReg_32:$len)), + (S_PREFETCH_DATA $sbase, $offset, $len, 0) + >; + + def : GCNPat < + (int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), (i32 SReg_32:$len)), + (S_PREFETCH_DATA $sbase, 0, $len, 0) + >; + + def : GCNPat < + (int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), imm:$len), + (S_PREFETCH_DATA $sbase, $offset, (i32 SGPR_NULL), (as_i8timm $len)) + >; + + def : GCNPat < + (int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), imm:$len), + (S_PREFETCH_DATA $sbase, 0, (i32 SGPR_NULL), (as_i8timm $len)) + >; +} // End let SubtargetPredicate = isGFX12Plus + //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll new file mode 100644 index 0000000000000..54c39d78adb58 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps void @prefetch_data_sgpr_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) { +; GCN-LABEL: prefetch_data_sgpr_base_sgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_imm_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) { +; GCN-LABEL: prefetch_data_sgpr_imm_base_sgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x200, s2, 0 +; GCN-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len(ptr addrspace(4) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_imm_base_imm_len(ptr addrspace(4) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_imm_base_imm_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x200, null, 31 +; GCN-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) { +; GCN-LABEL: prefetch_data_vgpr_base_sgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_imm_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) { +; SDAG-LABEL: prefetch_data_vgpr_imm_base_sgpr_len: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; SDAG-NEXT: s_prefetch_data s[2:3], 0x200, s0, 0 +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: prefetch_data_vgpr_imm_base_sgpr_len: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x200, v0 +; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GISEL-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0 +; GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len) + ret void +} + 
+define amdgpu_ps void @prefetch_data_sgpr_base_vgpr_len(ptr addrspace(4) inreg %ptr, i32 %len) { +; GCN-LABEL: prefetch_data_sgpr_base_vgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_global(ptr addrspace(1) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len_global: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_flat(ptr inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len_flat: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_local(ptr addrspace(3) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len_local: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p3(ptr addrspace(3) %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_base_imm_len(ptr addrspace(4) %ptr) { +; GCN-LABEL: prefetch_data_vgpr_base_imm_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 0) + ret void +} + +declare void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) +declare void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 %len) +declare void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 %len) From c1c42518c1356e78a10bf252a4a5a643b2bb9efd Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 5 Sep 2024 15:25:50 -0700 Subject: [PATCH 307/425] [SandboxVec] Early return checks (#107465) This patch implements a couple of early return checks. --- .../SandboxVectorizer/SandboxVectorizer.cpp | 11 +++++++++ .../X86/no_implicit_float.ll | 23 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 llvm/test/Transforms/SandboxVectorizer/X86/no_implicit_float.ll diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ec4bfbb56ecb4..e9be6f5283fea 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -29,7 +29,18 @@ PreservedAnalyses SandboxVectorizerPass::run(Function &F, } bool SandboxVectorizerPass::runImpl(Function &F) { + // If the target claims to have no vector registers early return. + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { + LLVM_DEBUG(dbgs() << "SBVec: Target has no vector registers, return.\n"); + return false; + } LLVM_DEBUG(dbgs() << "SBVec: Analyzing " << F.getName() << ".\n"); + // Early return if the attribute NoImplicitFloat is used. + if (F.hasFnAttribute(Attribute::NoImplicitFloat)) { + LLVM_DEBUG(dbgs() << "SBVec: NoImplicitFloat attribute, return.\n"); + return false; + } + sandboxir::Context Ctx(F.getContext()); // Create SandboxIR for `F`. 
sandboxir::Function &SBF = *Ctx.createFunction(&F); diff --git a/llvm/test/Transforms/SandboxVectorizer/X86/no_implicit_float.ll b/llvm/test/Transforms/SandboxVectorizer/X86/no_implicit_float.ll new file mode 100644 index 0000000000000..37fc03cb31166 --- /dev/null +++ b/llvm/test/Transforms/SandboxVectorizer/X86/no_implicit_float.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sandbox-vectorizer -mtriple=x86_64-- -mattr=+avx %s -S | FileCheck %s + +; Check that we don't vectorize a NoImplicitFloat function. +define void @no_implicit_float(ptr %ptr) noimplicitfloat { +; CHECK-LABEL: define void @no_implicit_float( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr double, ptr [[PTR]], i32 0 +; CHECK-NEXT: [[PTR1:%.*]] = getelementptr double, ptr [[PTR]], i32 1 +; CHECK-NEXT: [[LD0:%.*]] = load double, ptr [[PTR0]], align 8 +; CHECK-NEXT: [[LD1:%.*]] = load double, ptr [[PTR1]], align 8 +; CHECK-NEXT: store double [[LD0]], ptr [[PTR0]], align 8 +; CHECK-NEXT: store double [[LD1]], ptr [[PTR1]], align 8 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr double, ptr %ptr, i32 0 + %ptr1 = getelementptr double, ptr %ptr, i32 1 + %ld0 = load double, ptr %ptr0 + %ld1 = load double, ptr %ptr1 + store double %ld0, ptr %ptr0 + store double %ld1, ptr %ptr1 + ret void +} From 169d453429ca9015046b42719ff5d13cda5d2c6f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 15:47:28 -0700 Subject: [PATCH 308/425] [ADT] Declare replaceAllocation in SmallVector.cpp (NFC) (#107469) This patch changes replaceAllocation to a static function while moving the declaration to SmallVector.cpp. Note that: - replaceAllocation is used only within SmallVector.cpp. - replaceAllocation doesn't access any class members. --- llvm/include/llvm/ADT/SmallVector.h | 13 ------------- llvm/lib/Support/SmallVector.cpp | 16 ++++++++++++---- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h index 730f84ca038d6..bd3e887e36bce 100644 --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -74,19 +74,6 @@ template class SmallVectorBase { /// This function will report a fatal error if it cannot increase capacity. void grow_pod(void *FirstEl, size_t MinSize, size_t TSize); - /// If vector was first created with capacity 0, getFirstEl() points to the - /// memory right after, an area unallocated. If a subsequent allocation, - /// that grows the vector, happens to return the same pointer as getFirstEl(), - /// get a new allocation, otherwise isSmall() will falsely return that no - /// allocation was done (true) and the memory will not be freed in the - /// destructor. If a VSize is given (vector size), also copy that many - /// elements to the new allocation - used if realloca fails to increase - /// space, and happens to allocate precisely at BeginX. - /// This is unlikely to be called often, but resolves a memory leak when the - /// situation does occur. 
- void *replaceAllocation(void *NewElts, size_t TSize, size_t NewCapacity, - size_t VSize = 0); - public: size_t size() const { return Size; } size_t capacity() const { return Capacity; } diff --git a/llvm/lib/Support/SmallVector.cpp b/llvm/lib/Support/SmallVector.cpp index b6ce37842040b..dceea4fbc630e 100644 --- a/llvm/lib/Support/SmallVector.cpp +++ b/llvm/lib/Support/SmallVector.cpp @@ -108,10 +108,18 @@ static size_t getNewCapacity(size_t MinSize, size_t TSize, size_t OldCapacity) { return std::clamp(NewCapacity, MinSize, MaxSize); } -template -void *SmallVectorBase::replaceAllocation(void *NewElts, size_t TSize, - size_t NewCapacity, - size_t VSize) { +/// If vector was first created with capacity 0, getFirstEl() points to the +/// memory right after, an area unallocated. If a subsequent allocation, +/// that grows the vector, happens to return the same pointer as getFirstEl(), +/// get a new allocation, otherwise isSmall() will falsely return that no +/// allocation was done (true) and the memory will not be freed in the +/// destructor. If a VSize is given (vector size), also copy that many +/// elements to the new allocation - used if realloca fails to increase +/// space, and happens to allocate precisely at BeginX. +/// This is unlikely to be called often, but resolves a memory leak when the +/// situation does occur. +static void *replaceAllocation(void *NewElts, size_t TSize, size_t NewCapacity, + size_t VSize = 0) { void *NewEltsReplace = llvm::safe_malloc(NewCapacity * TSize); if (VSize) memcpy(NewEltsReplace, NewElts, VSize * TSize); From 6d3725924fe6adf0d490697327938de9c3516cbe Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Fri, 6 Sep 2024 07:03:05 +0800 Subject: [PATCH 309/425] [clang-tidy][NFC] remove autosar link in documents (#107412) As discussion in https://discourse.llvm.org/t/clang-tidy-rfc-add-autosar-c-14-clang-tidy-module/59223/12. We should not link clang-tidy check with AUTOSAR rules. --- .../docs/clang-tidy/checks/misc/const-correctness.rst | 3 +-- .../clang-tidy/checks/misc/unconventional-assign-operator.rst | 3 --- .../checks/readability/avoid-nested-conditional-operator.rst | 3 --- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst index 86fba6c7e4f7c..8ac1ad56bc8cf 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst @@ -7,8 +7,7 @@ This check implements detection of local variables which could be declared as ``const`` but are not. Declaring variables as ``const`` is required or recommended by many coding guidelines, such as: `ES.25 `_ -from the C++ Core Guidelines and `AUTOSAR C++14 Rule A7-1-1 (6.7.1 Specifiers) -`_. +from the C++ Core Guidelines. Please note that this check's analysis is type-based only. Variables that are not modified but used to create a non-const handle that might escape the scope are not diagnosed diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/unconventional-assign-operator.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/unconventional-assign-operator.rst index 3b4b65a5cb683..49e3fd5b6ee42 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/unconventional-assign-operator.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/unconventional-assign-operator.rst @@ -13,6 +13,3 @@ types and definitions with good return type but wrong ``return`` statements. 
type (e.g. ``int``). * Private and deleted operators are ignored. * The operator must always return ``*this``. - -This check implements `AUTOSAR C++14 Rule A13-2-1 -`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-nested-conditional-operator.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-nested-conditional-operator.rst index 44b74283292ce..cd3906855d497 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-nested-conditional-operator.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-nested-conditional-operator.rst @@ -16,6 +16,3 @@ Examples: int NestInConditional = (condition1 ? true1 : false1) ? true2 : false2; int NestInTrue = condition1 ? (condition2 ? true1 : false1) : false2; int NestInFalse = condition1 ? true1 : condition2 ? true2 : false1; - -This check implements part of `AUTOSAR C++14 Rule A5-16-1 -`_. From c8834527b729c8c89f453d215e667047fd948aa1 Mon Sep 17 00:00:00 2001 From: Tai Ly Date: Thu, 5 Sep 2024 18:07:36 -0500 Subject: [PATCH 310/425] [TOSA] Move CreateOpAndInfer into ConversionUtils.h (#106122) This moves CreateOpAndInfer from TF legalize_util.h into ConversionUtils.h also removed duplicate createOpAndInfer function from TosaDecomposeTransposeConv.cpp Renamed to CreateOpAndInferShape so we can upstream this independently of tensorflow (otherwise a redefinition error would break TF compile if not upstreamed together with removal of CreateOpAndInfer in TF) --------- Signed-off-by: Tai Ly --- .../mlir/Dialect/Tosa/Utils/ConversionUtils.h | 137 ++++++++++++++++++ .../Transforms/TosaDecomposeTransposeConv.cpp | 93 +++--------- .../Dialect/Tosa/Utils/ConversionUtils.cpp | 12 +- 3 files changed, 169 insertions(+), 73 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h index ceab7d9c628a5..ef40b348ab549 100644 --- a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h +++ b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h @@ -15,7 +15,9 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tosa/Utils/ShapeUtils.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/PatternMatch.h" #include @@ -79,6 +81,141 @@ checkHasDynamicBatchDims(PatternRewriter &rewriter, Op op, LogicalResult EqualizeRanks(PatternRewriter &rewriter, Location loc, Value &input1, Value &input2); +LogicalResult EqualizeRanks(ImplicitLocOpBuilder &builder, Value &input1, + Value &input2); + +namespace { + +// Creates a TOSA operation and performs shape inference on the individual +// op. This allows shape inference when lowering down to TOSA. +template +TosaOp createOpAndInferShape(ImplicitLocOpBuilder &builder, Type resultTy, + Args &&...args) { + auto op = builder.create(resultTy, args...); + + InferShapedTypeOpInterface shapeInterface = + dyn_cast(op.getOperation()); + if (!shapeInterface) + return op; + + SmallVector returnedShapes; + if (shapeInterface + .inferReturnTypeComponents(op.getContext(), builder.getLoc(), + op->getOperands(), op->getAttrDictionary(), + op->getPropertiesStorage(), + op->getRegions(), returnedShapes) + .failed()) + return op; + + // We need to use the element type of the existing result type to generate + // the new result shaped type. This is because rescale can include a cast to + // different bit-width types and does not have a TypeAttr to define the + // target type. 
+ auto result = op->getResult(0); + auto predictedShape = returnedShapes[0]; + auto currentKnowledge = ValueKnowledge::getKnowledgeFromType(resultTy); + + // Compute the knowledge based on the inferred type. + auto inferredKnowledge = ValueKnowledge::getPessimisticValueState(); + inferredKnowledge.dtype = mlir::cast(resultTy).getElementType(); + inferredKnowledge.hasRank = predictedShape.hasRank(); + if (predictedShape.hasRank()) { + for (auto dim : predictedShape.getDims()) { + inferredKnowledge.sizes.push_back(dim); + } + } + + // Compute the new type based on the joined version. + auto newKnowledge = ValueKnowledge::join(currentKnowledge, inferredKnowledge); + Type newTy = + newKnowledge.hasRank + ? Type{mlir::RankedTensorType::get(llvm::ArrayRef(newKnowledge.sizes), + newKnowledge.dtype)} + : Type{mlir::UnrankedTensorType::get(newKnowledge.dtype)}; + result.setType(newTy); + return op; +} + +} // namespace + +// Creates a TOSA operation by: +// - first equalize ranks for ops with SameOperandsAndResultRank trait +// - create operator +// - performs shape inference on this operator +template +TosaOp CreateOpAndInferShape(ImplicitLocOpBuilder &builder, Type resultTy, + Args &&...args) { + if (TosaOp::template hasTrait()) { + // op requires same ranks for tensor operands + if constexpr (sizeof...(Args) == 2) { + auto argX = std::get<0>(std::tie(args...)); + auto argY = std::get<1>(std::tie(args...)); + using ArgX = decltype(argX); + using ArgY = decltype(argY); + if constexpr (std::is_same_v && + std::is_same_v) { + Value x = std::get<0>(std::tie(args...)); + Value y = std::get<1>(std::tie(args...)); + if (EqualizeRanks(builder, x, y).failed()) { + // incompatible broadcast shapes, no reshape is inserted + // ResultsBroadcastableShape verify will handle this + } + return createOpAndInferShape(builder, resultTy, x, y); + } + } + if constexpr (sizeof...(Args) == 3) { + auto argX = std::get<0>(std::tie(args...)); + auto argY = std::get<1>(std::tie(args...)); + auto argZ = std::get<2>(std::tie(args...)); + using ArgX = decltype(argX); + using ArgY = decltype(argY); + using ArgZ = decltype(argZ); + if constexpr (std::is_same_v && + std::is_same_v && std::is_same_v) { + // special case for ArithmeticRightShiftOp + Value x = std::get<0>(std::tie(args...)); + Value y = std::get<1>(std::tie(args...)); + bool round = std::get<2>(std::tie(args...)); + if (EqualizeRanks(builder, x, y).failed()) { + // incompatible broadcast shapes, no reshape is inserted + // ResultsBroadcastableShape verify will handle this + } + return createOpAndInferShape(builder, resultTy, x, y, round); + } + if constexpr (std::is_same_v && + std::is_same_v && + std::is_same_v) { + // special case for Select + Value x = std::get<0>(std::tie(args...)); + Value y = std::get<1>(std::tie(args...)); + Value z = std::get<2>(std::tie(args...)); + + if (EqualizeRanks(builder, x, y).failed() || + EqualizeRanks(builder, x, z).failed() || + EqualizeRanks(builder, y, z).failed()) { + // incompatible broadcast shapes, no reshape is inserted + // ResultsBroadcastableShape verify will handle this + } + + return createOpAndInferShape(builder, resultTy, x, y, z); + } + } + } + + return createOpAndInferShape(builder, resultTy, args...); +} + +// Creates a TOSA operation by: +// - first equalize ranks for ops with SameOperandsAndResultRank trait +// - create operator +// - performs shape inference on this operator +template +TosaOp CreateOpAndInferShape(PatternRewriter &rewriter, Location loc, + Type resultTy, Args &&...args) { + 
ImplicitLocOpBuilder builder(loc, rewriter); + return CreateOpAndInferShape(builder, resultTy, args...); +} + } // namespace tosa } // namespace mlir diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp index a94bb3a920b1d..0779cdb9667a1 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp @@ -26,53 +26,6 @@ using namespace mlir::tosa; namespace { -template -TosaOp createOpAndInfer(PatternRewriter &rewriter, Location loc, Type resultTy, - Args &&...args) { - auto op = rewriter.create(loc, resultTy, args...); - - InferShapedTypeOpInterface shapeInterface = - dyn_cast(op.getOperation()); - if (!shapeInterface) - return op; - - SmallVector returnedShapes; - if (shapeInterface - .inferReturnTypeComponents( - op.getContext(), op.getLoc(), op->getOperands(), - op->getDiscardableAttrDictionary(), op->getPropertiesStorage(), - op->getRegions(), returnedShapes) - .failed()) - return op; - - // We need to use the element type of the existing result type to generate - // the new result shaped type. This is because rescale can include a cast to - // different bit-width types and does not have a TypeAttr to define the - // target type. - auto result = op->getResult(0); - auto predictedShape = returnedShapes[0]; - auto currentKnowledge = - mlir::tosa::ValueKnowledge::getKnowledgeFromType(resultTy); - - // Compute the knowledge based on the inferred type. - auto inferredKnowledge = - mlir::tosa::ValueKnowledge::getPessimisticValueState(); - inferredKnowledge.dtype = cast(resultTy).getElementType(); - inferredKnowledge.hasRank = predictedShape.hasRank(); - if (predictedShape.hasRank()) { - for (auto dim : predictedShape.getDims()) { - inferredKnowledge.sizes.push_back(dim); - } - } - - // Compute the new type based on the joined version. - auto newKnowledge = - mlir::tosa::ValueKnowledge::join(currentKnowledge, inferredKnowledge); - auto newTy = newKnowledge.getType(); - result.setType(newTy); - return op; -} - class TransposeConvNonStridedConverter : public OpRewritePattern { public: @@ -187,20 +140,20 @@ class TransposeConvStridedConverter (weightWidth % stride[1]) ? 
(stride[1] - weightWidth % stride[1]) : 0; DenseElementsAttr weightPaddingAttr = DenseIntElementsAttr::get( RankedTensorType::get({4, 2}, rewriter.getI32Type()), weightPadding); - Value weightPaddingVal = createOpAndInfer( + Value weightPaddingVal = CreateOpAndInferShape( rewriter, loc, weightPaddingAttr.getType(), weightPaddingAttr); if (op.getQuantizationInfo().has_value()) { auto quantInfo = op.getQuantizationInfo().value(); - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, weightPaddingVal, nullptr, rewriter.getAttr(quantInfo.getWeightZp())); } else { - weight = createOpAndInfer(rewriter, loc, - UnrankedTensorType::get(weightETy), - weight, weightPaddingVal); + weight = CreateOpAndInferShape( + rewriter, loc, UnrankedTensorType::get(weightETy), weight, + weightPaddingVal); } weightTy = cast(weight.getType()); @@ -212,7 +165,7 @@ class TransposeConvStridedConverter outputChannels, weightHeight / stride[0], stride[0], weightWidth / stride[1], stride[1], inputChannels}; - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, rewriter.getDenseI64ArrayAttr(weightReshapeDims0)); @@ -221,7 +174,7 @@ class TransposeConvStridedConverter loc, RankedTensorType::get({6}, rewriter.getI32Type()), rewriter.getI32TensorAttr({2, 4, 0, 1, 3, 5})); - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, transposeWeightVal); @@ -229,15 +182,15 @@ class TransposeConvStridedConverter llvm::SmallVector weightReshapeDims1 = { outputChannels * stride[0] * stride[1], weightHeight / stride[0], weightWidth / stride[1], inputChannels}; - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, rewriter.getDenseI64ArrayAttr(weightReshapeDims1)); ShapedType restridedWeightTy = cast(weight.getType()); - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, /* axis = */ rewriter.getI32IntegerAttr(1)); - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, /* axis = */ rewriter.getI32IntegerAttr(2)); @@ -251,19 +204,19 @@ class TransposeConvStridedConverter DenseElementsAttr inputPaddingAttr = DenseIntElementsAttr::get( RankedTensorType::get({4, 2}, rewriter.getI32Type()), inputPadding); - Value inputPaddingVal = createOpAndInfer( + Value inputPaddingVal = CreateOpAndInferShape( rewriter, loc, inputPaddingAttr.getType(), inputPaddingAttr); if (op.getQuantizationInfo().has_value()) { auto quantInfo = op.getQuantizationInfo().value(); - input = createOpAndInfer( + input = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(inputETy), input, inputPaddingVal, nullptr, rewriter.getAttr(quantInfo.getInputZp())); } else { - input = createOpAndInfer(rewriter, loc, - UnrankedTensorType::get(inputETy), - input, inputPaddingVal); + input = CreateOpAndInferShape( + rewriter, loc, UnrankedTensorType::get(inputETy), input, + inputPaddingVal); } // We use a zero bias as we need to broadcast the bias. @@ -279,7 +232,7 @@ class TransposeConvStridedConverter // Perform the convolution using the zero bias. 
Value conv2d; if (op.getQuantizationInfo()) { - conv2d = createOpAndInfer( + conv2d = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), input, weight, zeroBias, /*pad=*/rewriter.getDenseI64ArrayAttr({0, 0, 0, 0}), @@ -288,7 +241,7 @@ class TransposeConvStridedConverter *op.getQuantizationInfo()) .getResult(); } else { - conv2d = createOpAndInfer( + conv2d = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), input, weight, zeroBias, /*pad=*/rewriter.getDenseI64ArrayAttr({0, 0, 0, 0}), @@ -307,7 +260,7 @@ class TransposeConvStridedConverter // Factor striding out of the convolution result. llvm::SmallVector convReshapeDims0 = { batch, convHeight, convWidth, stride[0], stride[1], outputChannels}; - conv2d = createOpAndInfer( + conv2d = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), conv2d, rewriter.getDenseI64ArrayAttr(convReshapeDims0)); @@ -316,14 +269,14 @@ class TransposeConvStridedConverter loc, RankedTensorType::get({6}, rewriter.getI32Type()), rewriter.getI32TensorAttr({0, 1, 3, 2, 4, 5})); - conv2d = createOpAndInfer( + conv2d = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(convETy), conv2d, transposeConvVal); // Fuse striding behavior back into width / height. llvm::SmallVector convReshapeDims1 = { batch, convHeight * stride[0], convWidth * stride[1], outputChannels}; - conv2d = createOpAndInfer( + conv2d = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), conv2d, rewriter.getDenseI64ArrayAttr(convReshapeDims1)); @@ -348,7 +301,7 @@ class TransposeConvStridedConverter sliceSize[1] = resultSliceHeight; sliceSize[2] = resultSliceWidth; - auto slice = createOpAndInfer( + auto slice = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), conv2d, rewriter.getDenseI64ArrayAttr(sliceBegin), rewriter.getDenseI64ArrayAttr(sliceSize)) @@ -363,10 +316,10 @@ class TransposeConvStridedConverter DenseElementsAttr resultPaddingAttr = DenseIntElementsAttr::get( RankedTensorType::get({4, 2}, rewriter.getI32Type()), resultPadding); - Value resultPaddingVal = createOpAndInfer( + Value resultPaddingVal = CreateOpAndInferShape( rewriter, loc, resultPaddingAttr.getType(), resultPaddingAttr); - Value resultPad = createOpAndInfer( + Value resultPad = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), slice, resultPaddingVal); diff --git a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp index f276924a8a9f6..1f6e3b2ab8391 100644 --- a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp +++ b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp @@ -102,6 +102,12 @@ computeReshapeOutput(ArrayRef higherRankShape, LogicalResult mlir::tosa::EqualizeRanks(PatternRewriter &rewriter, Location loc, Value &input1, Value &input2) { + ImplicitLocOpBuilder builder(loc, rewriter); + return EqualizeRanks(builder, input1, input2); +} + +LogicalResult mlir::tosa::EqualizeRanks(ImplicitLocOpBuilder &builder, + Value &input1, Value &input2) { auto input1Ty = llvm::dyn_cast(input1.getType()); auto input2Ty = llvm::dyn_cast(input2.getType()); @@ -140,9 +146,9 @@ LogicalResult mlir::tosa::EqualizeRanks(PatternRewriter &rewriter, Location loc, auto reshapeOutputType = RankedTensorType::get( ArrayRef(reshapeOutputShape), reshapeInputType.getElementType()); - auto reshapeLower = rewriter.create( - loc, reshapeOutputType, lowerTensorValue, - rewriter.getDenseI64ArrayAttr(reshapeOutputShape)); + auto reshapeLower = builder.create( + 
reshapeOutputType, lowerTensorValue, + builder.getDenseI64ArrayAttr(reshapeOutputShape)); if (input1Rank > input2Rank) { input1 = higherTensorValue; From 67fb8d15c993f5695cf944b16022a9ee49b9252d Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 16:11:08 -0700 Subject: [PATCH 311/425] [lldb] Convert ProcessDebugger.cpp to new Status API (NFC) --- .../source/Plugins/Process/Windows/Common/ProcessDebugger.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp index 9fc8077851d25..bde72d61b0fee 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp @@ -560,7 +560,7 @@ void ProcessDebugger::OnDebuggerError(const Status &error, uint32_t type) { // If we haven't actually launched the process yet, this was an error // launching the process. Set the internal error and signal the initial // stop event so that the DoLaunch method wakes up and returns a failure. - m_session_data->m_launch_error = error; + m_session_data->m_launch_error = error.Clone(); ::SetEvent(m_session_data->m_initial_stop_event); LLDB_LOG(log, "Error {0} occurred launching the process before the initial " @@ -582,7 +582,7 @@ Status ProcessDebugger::WaitForDebuggerConnection(DebuggerThreadSP debugger, LLDB_LOG(log, "hit loader breakpoint, returning."); process = debugger->GetProcess(); - return m_session_data->m_launch_error; + return m_session_data->m_launch_error.Clone(); } else return Status(::GetLastError(), eErrorTypeWin32); } From e0a93d3505bf6b4c87e819db7a871e0ce4d4100c Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 16:12:11 -0700 Subject: [PATCH 312/425] [lldb] Convert ProcessWindows.cpp to new Status API (NFC) --- lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp | 2 +- lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp index 6f2f6633021d3..9a5fcd88e1282 100644 --- a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp @@ -57,7 +57,7 @@ Status DebuggerThread::DebugLaunch(const ProcessLaunchInfo &launch_info) { "lldb.plugin.process-windows.secondary[?]", [this, launch_info] { return DebuggerThreadLaunchRoutine(launch_info); }); if (!secondary_thread) { - result = Status(secondary_thread.takeError()); + result = Status::FromError(secondary_thread.takeError()); LLDB_LOG(log, "couldn't launch debugger thread. {0}", result); } diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 688b363bbb2db..b25068dda53ff 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -821,7 +821,7 @@ void ProcessWindows::OnDebuggerError(const Status &error, uint32_t type) { // If we haven't actually launched the process yet, this was an error // launching the process. Set the internal error and signal the initial // stop event so that the DoLaunch method wakes up and returns a failure. 
- m_session_data->m_launch_error = error; + m_session_data->m_launch_error = error.Clone(); ::SetEvent(m_session_data->m_initial_stop_event); LLDB_LOG( log, From 1be9a80768a03ea9bd2bfbb03762b2bc3c350007 Mon Sep 17 00:00:00 2001 From: Kai Sasaki Date: Fri, 6 Sep 2024 08:21:47 +0900 Subject: [PATCH 313/425] [mlir][affine] Fix the crash due to the simultaneous replacement store (#90829) `AffineScalarReplacement` should forward the memref store op to load op only if the store op reaches the load. But it now checks the reachability only if these ops are in the same block, which causes the crash reported in https://github.com/llvm/llvm-project/issues/76309. We need to check the reachability even if they are both in the same block, which rescues the case where consecutive store operations are written before the load op. --- mlir/lib/Dialect/Affine/Utils/Utils.cpp | 3 +-- mlir/test/Dialect/Affine/scalrep.mlir | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp index f46381403bc52..898467d573362 100644 --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -860,8 +860,7 @@ static void forwardStoreToLoad( // 3. The store must reach the load. Access function equivalence only // guarantees this for accesses in the same block. The load could be in a // nested block that is unreachable. - if (storeOp->getBlock() != loadOp->getBlock() && - !mustReachAtInnermost(srcAccess, destAccess)) + if (!mustReachAtInnermost(srcAccess, destAccess)) continue; // 4. Ensure there is no intermediate operation which could replace the diff --git a/mlir/test/Dialect/Affine/scalrep.mlir b/mlir/test/Dialect/Affine/scalrep.mlir index 4a99dee50a280..fdfe3bfb62f95 100644 --- a/mlir/test/Dialect/Affine/scalrep.mlir +++ b/mlir/test/Dialect/Affine/scalrep.mlir @@ -5,6 +5,7 @@ // CHECK-DAG: [[$MAP2:#map[0-9]*]] = affine_map<(d0, d1) -> (d1)> // CHECK-DAG: [[$MAP3:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 - 1)> // CHECK-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)> +// CHECK-DAG: [[$IDENT:#map[0-9]*]] = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @simple_store_load() { func.func @simple_store_load() { @@ -931,3 +932,23 @@ func.func @cross_block() { %69 = affine.load %alloc_99[%c10] : memref<13xi1> return } + +#map1 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @consecutive_store +func.func @consecutive_store() { + // CHECK: %[[CST:.*]] = arith.constant + %tmp = arith.constant 1.1 : f16 + // CHECK: %[[ALLOC:.*]] = memref.alloc + %alloc_66 = memref.alloc() : memref + affine.for %arg2 = 4 to 6 { + affine.for %arg3 = #map1(%arg2) to #map1(%arg2) step 4 { + // CHECK: affine.store %[[CST]], %[[ALLOC]][] + affine.store %tmp, %alloc_66[] : memref + // CHECK-NOT: affine.store %[[CST]], %[[ALLOC]][] + affine.store %tmp, %alloc_66[] : memref + %270 = affine.load %alloc_66[] : memref + } + } + return +} From d1756165a9066f907b88d51dd8e3ffee15a8cc1e Mon Sep 17 00:00:00 2001 From: alx32 <103613512+alx32@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:36:21 -0700 Subject: [PATCH 314/425] [lld-macho][arm64] Enhance safe ICF with thunk-based deduplication (#106573) Currently, our `safe` ICF mode only merges non-address-significant code, leaving duplicate address-significant functions in the output. 
This patch introduces `safe_thunks` ICF mode, which keeps a single master copy of each function and replaces address-significant duplicates with thunks that branch to the master copy. Currently `--icf=safe_thunks` is only supported for `arm64` architectures. **Perf stats for a large binary:** | ICF Option | Total Size | __text Size | __unwind_info | % total | |-------------------|------------|-------------|---------------------|---------------------------| | `--icf=none` | 91.738 MB | 55.220 MB | 1.424 MB | 0% | | `--icf=safe` | 85.042 MB | 49.572 MB | 1.168 MB | 7.30% | | `--icf=safe_thunks` | 84.650 MB | 49.219 MB | 1.143 MB | 7.72% | | `--icf=all` | 82.060 MB | 48.726 MB | 1.111 MB | 10.55% | So overall we can expect a `~0.45%` binary size reduction for a typical large binary compared to the `--icf=safe` option. **Runtime:** Linking the above binary took ~10 seconds. Comparing the link performance of --icf=safe_thunks vs --icf=safe, a ~2% slowdown was observed. --- lld/MachO/Arch/ARM64.cpp | 23 +++ lld/MachO/Config.h | 1 + lld/MachO/Driver.cpp | 9 +- lld/MachO/ICF.cpp | 92 ++++++++++- lld/MachO/InputSection.cpp | 5 +- lld/MachO/InputSection.h | 3 +- lld/MachO/MapFile.cpp | 2 +- lld/MachO/Symbols.cpp | 2 +- lld/MachO/Symbols.h | 13 +- lld/MachO/SyntheticSections.cpp | 3 +- lld/MachO/Target.h | 10 ++ lld/test/MachO/icf-safe-thunks.ll | 254 ++++++++++++++++++++++++++++++ 12 files changed, 405 insertions(+), 12 deletions(-) create mode 100644 lld/test/MachO/icf-safe-thunks.ll diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp index e192676394c96..195a8f09f47c1 100644 --- a/lld/MachO/Arch/ARM64.cpp +++ b/lld/MachO/Arch/ARM64.cpp @@ -41,6 +41,10 @@ struct ARM64 : ARM64Common { Symbol *objcMsgSend) const override; void populateThunk(InputSection *thunk, Symbol *funcSym) override; void applyOptimizationHints(uint8_t *, const ObjFile &) const override; + + void initICFSafeThunkBody(InputSection *thunk, + InputSection *branchTarget) const override; + uint32_t getICFSafeThunkSize() const override; }; } // namespace @@ -175,6 +179,25 @@ void ARM64::populateThunk(InputSection *thunk, Symbol *funcSym) { /*offset=*/0, /*addend=*/0, /*referent=*/funcSym); } +// Just a single direct branch to the target function. +static constexpr uint32_t icfSafeThunkCode[] = { + 0x14000000, // 08: b target +}; + +void ARM64::initICFSafeThunkBody(InputSection *thunk, + InputSection *branchTarget) const { + // The base data here will not be itself modified, we'll just be adding a + // reloc below. So we can directly use the constexpr above as the data. 
+ thunk->data = {reinterpret_cast(icfSafeThunkCode), + sizeof(icfSafeThunkCode)}; + + thunk->relocs.emplace_back(/*type=*/ARM64_RELOC_BRANCH26, + /*pcrel=*/true, /*length=*/2, + /*offset=*/0, /*addend=*/0, + /*referent=*/branchTarget); +} + +uint32_t ARM64::getICFSafeThunkSize() const { return sizeof(icfSafeThunkCode); } ARM64::ARM64() : ARM64Common(LP64()) { cpuType = CPU_TYPE_ARM64; diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index 5beb0662ba727..4e940693602c9 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -68,6 +68,7 @@ enum class ICFLevel { unknown, none, safe, + safe_thunks, all, }; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 09a539d71dab3..fa5a5dab8b565 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -847,8 +847,14 @@ static ICFLevel getICFLevel(const ArgList &args) { auto icfLevel = StringSwitch(icfLevelStr) .Cases("none", "", ICFLevel::none) .Case("safe", ICFLevel::safe) + .Case("safe_thunks", ICFLevel::safe_thunks) .Case("all", ICFLevel::all) .Default(ICFLevel::unknown); + + if ((icfLevel == ICFLevel::safe_thunks) && (config->arch() != AK_arm64)) { + error("--icf=safe_thunks is only supported on arm64 targets"); + } + if (icfLevel == ICFLevel::unknown) { warn(Twine("unknown --icf=OPTION `") + icfLevelStr + "', defaulting to `none'"); @@ -2116,7 +2122,8 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, // foldIdenticalLiterals before foldIdenticalSections. foldIdenticalLiterals(); if (config->icfLevel != ICFLevel::none) { - if (config->icfLevel == ICFLevel::safe) + if (config->icfLevel == ICFLevel::safe || + config->icfLevel == ICFLevel::safe_thunks) markAddrSigSymbols(); foldIdenticalSections(/*onlyCfStrings=*/false); } else if (config->dedupStrings) { diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp index fc786b571dc64..2ff962b06e367 100644 --- a/lld/MachO/ICF.cpp +++ b/lld/MachO/ICF.cpp @@ -45,6 +45,7 @@ class ICF { const ConcatInputSection *ib); bool equalsVariable(const ConcatInputSection *ia, const ConcatInputSection *ib); + void applySafeThunksToRange(size_t begin, size_t end); // ICF needs a copy of the inputs vector because its equivalence-class // segregation algorithm destroys the proper sequence. @@ -251,6 +252,50 @@ void ICF::forEachClassRange(size_t begin, size_t end, } } +// Given a range of identical icfInputs, replace address significant functions +// with a thunk that is just a direct branch to the first function in the +// series. This way we keep only one main body of the function but we still +// retain the address uniqueness of relevant functions by having them be a +// direct branch thunk rather than containing a full copy of the actual function +// body. +void ICF::applySafeThunksToRange(size_t begin, size_t end) { + // If the functions we're dealing with are smaller than the thunk size, then + // just leave them all as-is - creating thunks would be a net loss. + uint32_t thunkSize = target->getICFSafeThunkSize(); + if (icfInputs[begin]->data.size() <= thunkSize) + return; + + // When creating a unique ICF thunk, use the first section as the section that + // all thunks will branch to. + ConcatInputSection *masterIsec = icfInputs[begin]; + + for (size_t i = begin + 1; i < end; ++i) { + ConcatInputSection *isec = icfInputs[i]; + // When we're done processing keepUnique entries, we can stop. Sorting + // guaratees that all keepUnique will be at the front. 
+ if (!isec->keepUnique) + break; + + ConcatInputSection *thunk = + makeSyntheticInputSection(isec->getSegName(), isec->getName()); + addInputSection(thunk); + + target->initICFSafeThunkBody(thunk, masterIsec); + thunk->foldIdentical(isec, Symbol::ICFFoldKind::Thunk); + + // Since we're folding the target function into a thunk, we need to adjust + // the symbols that now got relocated from the target function to the thunk. + // Since the thunk is only one branch, we move all symbols to offset 0 and + // make sure that the size of all non-zero-size symbols is equal to the size + // of the branch. + for (auto *sym : thunk->symbols) { + sym->value = 0; + if (sym->size != 0) + sym->size = thunkSize; + } + } +} + // Split icfInputs into shards, then parallelize invocation of FUNC on subranges // with matching equivalence class void ICF::forEachClass(llvm::function_ref func) { @@ -312,6 +357,12 @@ void ICF::run() { llvm::stable_sort( icfInputs, [](const ConcatInputSection *a, const ConcatInputSection *b) { + // When using safe_thunks, ensure that we first sort by icfEqClass and + // then by keepUnique (descending). This guarantees that within an + // equivalence class, the keepUnique inputs are always first. + if (config->icfLevel == ICFLevel::safe_thunks) + if (a->icfEqClass[0] == b->icfEqClass[0]) + return a->keepUnique > b->keepUnique; return a->icfEqClass[0] < b->icfEqClass[0]; }); forEachClass([&](size_t begin, size_t end) { @@ -331,13 +382,37 @@ void ICF::run() { log("equalsVariable() called " + Twine(equalsVariableCount) + " times"); } + // When using safe_thunks, we need to create thunks for all keepUnique + // functions that can be deduplicated. Since we're creating / adding new + // InputSections, we can't paralellize this. + if (config->icfLevel == ICFLevel::safe_thunks) + forEachClassRange(0, icfInputs.size(), [&](size_t begin, size_t end) { + applySafeThunksToRange(begin, end); + }); + // Fold sections within equivalence classes forEachClass([&](size_t begin, size_t end) { if (end - begin < 2) return; + bool useSafeThunks = config->icfLevel == ICFLevel::safe_thunks; + + // For ICF level safe_thunks, replace keepUnique function bodies with + // thunks. For all other ICF levles, directly merge the functions. + ConcatInputSection *beginIsec = icfInputs[begin]; - for (size_t i = begin + 1; i < end; ++i) + for (size_t i = begin + 1; i < end; ++i) { + // Skip keepUnique inputs when using safe_thunks (already handeled above) + if (useSafeThunks && icfInputs[i]->keepUnique) { + // Assert keepUnique sections are either small or replaced with thunks. + assert(!icfInputs[i]->live || + icfInputs[i]->data.size() <= target->getICFSafeThunkSize()); + assert(!icfInputs[i]->replacement || + icfInputs[i]->replacement->data.size() == + target->getICFSafeThunkSize()); + continue; + } beginIsec->foldIdentical(icfInputs[i]); + } }); } @@ -421,11 +496,22 @@ void macho::foldIdenticalSections(bool onlyCfStrings) { // can still fold it. bool hasFoldableFlags = (isSelRefsSection(isec) || sectionType(isec->getFlags()) == MachO::S_REGULAR); + + bool isCodeSec = isCodeSection(isec); + + // When keepUnique is true, the section is not foldable. Unless we are at + // icf level safe_thunks, in which case we still want to fold code sections. + // When using safe_thunks we'll apply the safe_thunks logic at merge time + // based on the 'keepUnique' flag. 
+ bool noUniqueRequirement = + !isec->keepUnique || + ((config->icfLevel == ICFLevel::safe_thunks) && isCodeSec); + // FIXME: consider non-code __text sections as foldable? bool isFoldable = (!onlyCfStrings || isCfStringSection(isec)) && - (isCodeSection(isec) || isFoldableWithAddendsRemoved || + (isCodeSec || isFoldableWithAddendsRemoved || isGccExceptTabSection(isec)) && - !isec->keepUnique && !isec->hasAltEntry && + noUniqueRequirement && !isec->hasAltEntry && !isec->shouldOmitFromOutput() && hasFoldableFlags; if (isFoldable) { foldable.push_back(isec); diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp index a9b93e07a6013..64c584920defb 100644 --- a/lld/MachO/InputSection.cpp +++ b/lld/MachO/InputSection.cpp @@ -190,13 +190,14 @@ const Reloc *InputSection::getRelocAt(uint32_t off) const { return &*it; } -void ConcatInputSection::foldIdentical(ConcatInputSection *copy) { +void ConcatInputSection::foldIdentical(ConcatInputSection *copy, + Symbol::ICFFoldKind foldKind) { align = std::max(align, copy->align); copy->live = false; copy->wasCoalesced = true; copy->replacement = this; for (auto ©Sym : copy->symbols) - copySym->wasIdenticalCodeFolded = true; + copySym->identicalCodeFoldingKind = foldKind; symbols.insert(symbols.end(), copy->symbols.begin(), copy->symbols.end()); copy->symbols.clear(); diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h index 0f389e50425a3..4e238d8ef7779 100644 --- a/lld/MachO/InputSection.h +++ b/lld/MachO/InputSection.h @@ -117,7 +117,8 @@ class ConcatInputSection final : public InputSection { bool shouldOmitFromOutput() const { return !live || isCoalescedWeak(); } void writeTo(uint8_t *buf); - void foldIdentical(ConcatInputSection *redundant); + void foldIdentical(ConcatInputSection *redundant, + Symbol::ICFFoldKind foldKind = Symbol::ICFFoldKind::Body); ConcatInputSection *canonical() override { return replacement ? replacement : this; } diff --git a/lld/MachO/MapFile.cpp b/lld/MachO/MapFile.cpp index 5bcaeca48da2a..9c0621622ae2f 100644 --- a/lld/MachO/MapFile.cpp +++ b/lld/MachO/MapFile.cpp @@ -156,7 +156,7 @@ static void printNonLazyPointerSection(raw_fd_ostream &os, } static uint64_t getSymSizeForMap(Defined *sym) { - if (sym->wasIdenticalCodeFolded) + if (sym->identicalCodeFoldingKind == Symbol::ICFFoldKind::Body) return 0; return sym->size; } diff --git a/lld/MachO/Symbols.cpp b/lld/MachO/Symbols.cpp index f52da4f48aafb..9faf01e09de05 100644 --- a/lld/MachO/Symbols.cpp +++ b/lld/MachO/Symbols.cpp @@ -60,7 +60,7 @@ Defined::Defined(StringRef name, InputFile *file, InputSection *isec, bool interposable) : Symbol(DefinedKind, name, file), overridesWeakDef(canOverrideWeakDef), privateExtern(isPrivateExtern), includeInSymtab(includeInSymtab), - wasIdenticalCodeFolded(false), + identicalCodeFoldingKind(ICFFoldKind::None), referencedDynamically(isReferencedDynamically), noDeadStrip(noDeadStrip), interposable(interposable), weakDefCanBeHidden(isWeakDefCanBeHidden), weakDef(isWeakDef), external(isExternal), originalIsec(isec), diff --git a/lld/MachO/Symbols.h b/lld/MachO/Symbols.h index bd60226f38ad3..70fd195f25c92 100644 --- a/lld/MachO/Symbols.h +++ b/lld/MachO/Symbols.h @@ -33,6 +33,15 @@ class Symbol { AliasKind, }; + // Enum that describes the type of Identical Code Folding (ICF) applied to a + // symbol. This information is crucial for accurately representing symbol + // sizes in the map file. + enum ICFFoldKind { + None, // No folding is applied. + Body, // The entire body (function or data) is folded. 
+ Thunk // The function body is folded into a single branch thunk. + }; + virtual ~Symbol() {} Kind kind() const { return symbolKind; } @@ -142,8 +151,8 @@ class Defined : public Symbol { bool privateExtern : 1; // Whether this symbol should appear in the output symbol table. bool includeInSymtab : 1; - // Whether this symbol was folded into a different symbol during ICF. - bool wasIdenticalCodeFolded : 1; + // The ICF folding kind of this symbol: None / Body / Thunk. + ICFFoldKind identicalCodeFoldingKind : 2; // Symbols marked referencedDynamically won't be removed from the output's // symbol table by tools like strip. In theory, this could be set on arbitrary // symbols in input object files. In practice, it's used solely for the diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 939e9b286d77f..7b0078856c5e2 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1231,7 +1231,8 @@ void SymtabSection::emitStabs() { // Constant-folded symbols go in the executable's symbol table, but don't // get a stabs entry unless --keep-icf-stabs flag is specified - if (!config->keepICFStabs && defined->wasIdenticalCodeFolded) + if (!config->keepICFStabs && + defined->identicalCodeFoldingKind == Symbol::ICFFoldKind::Body) continue; ObjFile *file = defined->getObjectFile(); diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h index cc47ae4386b47..eaa0336e70cb6 100644 --- a/lld/MachO/Target.h +++ b/lld/MachO/Target.h @@ -74,6 +74,16 @@ class TargetInfo { uint64_t selrefVA, Symbol *objcMsgSend) const = 0; + // Init 'thunk' so that it be a direct jump to 'branchTarget'. + virtual void initICFSafeThunkBody(InputSection *thunk, + InputSection *branchTarget) const { + llvm_unreachable("target does not support ICF safe thunks"); + } + + virtual uint32_t getICFSafeThunkSize() const { + llvm_unreachable("target does not support ICF safe thunks"); + } + // Symbols may be referenced via either the GOT or the stubs section, // depending on the relocation type. 
prepareSymbolRelocation() will set up the // GOT/stubs entries, and resolveSymbolVA() will return the addresses of those diff --git a/lld/test/MachO/icf-safe-thunks.ll b/lld/test/MachO/icf-safe-thunks.ll new file mode 100644 index 0000000000000..238e90f952e16 --- /dev/null +++ b/lld/test/MachO/icf-safe-thunks.ll @@ -0,0 +1,254 @@ +; REQUIRES: aarch64 + +; RUN: rm -rf %t; mkdir %t +; RUN: llc -filetype=obj %s -O3 -o %t/icf-obj-safe-thunks.o -enable-machine-outliner=never -mtriple arm64-apple-macos -addrsig +; RUN: %lld -arch arm64 -lSystem --icf=safe_thunks -dylib -o %t/icf-safe.dylib -map %t/icf-safe.map %t/icf-obj-safe-thunks.o +; RUN: llvm-objdump %t/icf-safe.dylib -d --macho | FileCheck %s --check-prefixes=CHECK-ARM64 +; RUN: cat %t/icf-safe.map | FileCheck %s --check-prefixes=CHECK-ARM64-MAP + +; CHECK-ARM64: (__TEXT,__text) section +; CHECK-ARM64-NEXT: _func_unique_1: +; CHECK-ARM64-NEXT: mov {{.*}}, #0x1 +; +; CHECK-ARM64: _func_unique_2_canmerge: +; CHECK-ARM64-NEXT: _func_2identical_v1: +; CHECK-ARM64-NEXT: mov {{.*}}, #0x2 +; +; CHECK-ARM64: _func_3identical_v1: +; CHECK-ARM64-NEXT: mov {{.*}}, #0x3 +; +; CHECK-ARM64: _func_3identical_v1_canmerge: +; CHECK-ARM64-NEXT: _func_3identical_v2_canmerge: +; CHECK-ARM64-NEXT: _func_3identical_v3_canmerge: +; CHECK-ARM64-NEXT: mov {{.*}}, #0x21 +; +; CHECK-ARM64: _call_all_funcs: +; CHECK-ARM64-NEXT: stp x29 +; +; CHECK-ARM64: _take_func_addr: +; CHECK-ARM64-NEXT: adr +; +; CHECK-ARM64: _func_2identical_v2: +; CHECK-ARM64-NEXT: b _func_2identical_v1 +; CHECK-ARM64-NEXT: _func_3identical_v2: +; CHECK-ARM64-NEXT: b _func_3identical_v1 +; CHECK-ARM64-NEXT: _func_3identical_v3: +; CHECK-ARM64-NEXT: b _func_3identical_v1 + + +; CHECK-ARM64-MAP: 0x00000010 [ 2] _func_unique_1 +; CHECK-ARM64-MAP-NEXT: 0x00000010 [ 2] _func_2identical_v1 +; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_unique_2_canmerge +; CHECK-ARM64-MAP-NEXT: 0x00000010 [ 2] _func_3identical_v1 +; CHECK-ARM64-MAP-NEXT: 0x00000010 [ 2] _func_3identical_v1_canmerge +; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_3identical_v2_canmerge +; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_3identical_v3_canmerge +; CHECK-ARM64-MAP-NEXT: 0x00000034 [ 2] _call_all_funcs +; CHECK-ARM64-MAP-NEXT: 0x00000050 [ 2] _take_func_addr +; CHECK-ARM64-MAP-NEXT: 0x00000004 [ 2] _func_2identical_v2 +; CHECK-ARM64-MAP-NEXT: 0x00000004 [ 2] _func_3identical_v2 +; CHECK-ARM64-MAP-NEXT: 0x00000004 [ 2] _func_3identical_v3 + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-apple-macosx11.0.0" + +@g_val = global i8 0, align 1 +@g_ptr = global ptr null, align 8 + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_unique_1() #0 { +entry: + store volatile i8 1, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_unique_2_canmerge() local_unnamed_addr #0 { +entry: + store volatile i8 2, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_2identical_v1() #0 { +entry: + store volatile i8 2, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_2identical_v2() #0 { +entry: + store volatile i8 2, ptr @g_val, align 1, 
!tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v1() #0 { +entry: + store volatile i8 3, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v2() #0 { +entry: + store volatile i8 3, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v3() #0 { +entry: + store volatile i8 3, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v1_canmerge() local_unnamed_addr #0 { +entry: + store volatile i8 33, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v2_canmerge() local_unnamed_addr #0 { +entry: + store volatile i8 33, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v3_canmerge() local_unnamed_addr #0 { +entry: + store volatile i8 33, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp uwtable(sync) +define void @call_all_funcs() local_unnamed_addr #1 { +entry: + tail call void @func_unique_1() + tail call void @func_unique_2_canmerge() + tail call void @func_2identical_v1() + tail call void @func_2identical_v2() + tail call void @func_3identical_v1() + tail call void @func_3identical_v2() + tail call void @func_3identical_v3() + tail call void @func_3identical_v1_canmerge() + tail call void @func_3identical_v2_canmerge() + tail call void @func_3identical_v3_canmerge() + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @take_func_addr() local_unnamed_addr #0 { +entry: + store volatile ptr @func_unique_1, ptr @g_ptr, align 8, !tbaa !8 + store volatile ptr @func_2identical_v1, ptr @g_ptr, align 8, !tbaa !8 + store volatile ptr @func_2identical_v2, ptr @g_ptr, align 8, !tbaa !8 + store volatile ptr @func_3identical_v1, ptr @g_ptr, align 8, !tbaa !8 + store volatile ptr @func_3identical_v2, ptr @g_ptr, align 8, !tbaa !8 + store volatile ptr @func_3identical_v3, ptr @g_ptr, align 8, !tbaa !8 + ret void +} + +attributes #0 = { mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" } +attributes #1 = { mustprogress nofree noinline norecurse nounwind ssp uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" 
"target-features"="+aes,+altnzcv,+ccdp,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 1} +!3 = !{i32 7, !"frame-pointer", i32 1} +!4 = !{!"clang"} +!5 = !{!6, !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C++ TBAA"} +!8 = !{!9, !9, i64 0} +!9 = !{!"any pointer", !6, i64 0} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;; Generate the above LLVM IR with the below script ;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; #!/bin/bash +; set -ex +; TOOLCHAIN_BIN="llvm-project/build/Debug/bin" +; +; # Create icf-safe-thunks.cpp file +; cat > icf-safe-thunks.cpp < Date: Thu, 5 Sep 2024 23:41:18 +0000 Subject: [PATCH 315/425] [SandboxIR] Add bazel support (#107486) --- .../bazel/llvm-project-overlay/llvm/BUILD.bazel | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index b2dcc696b0ad0..b574397e108b7 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -1446,6 +1446,20 @@ cc_library( ], ) +cc_library( + name = "SandboxIR", + srcs = glob([ + "lib/SandboxIR/*.cpp", + ]), + hdrs = glob(["include/llvm/SandboxIR/*.h"]), + textual_hdrs = ["include/llvm/SandboxIR/SandboxIRValues.def"], + copts = llvm_copts, + deps = [ + ":Core", + ":Support", + ], +) + cc_library( name = "Scalar", srcs = glob([ @@ -1474,14 +1488,17 @@ cc_library( srcs = glob([ "lib/Transforms/Vectorize/*.cpp", "lib/Transforms/Vectorize/*.h", + "lib/Transforms/Vectorize/SandboxVectorizer/*.cpp", ]), hdrs = glob([ "include/llvm/Transforms/Vectorize/*.h", + "include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" ]), copts = llvm_copts, deps = [ ":Analysis", ":Core", + ":SandboxIR", ":Scalar", ":Support", ":Target", From d7dd2c468fecae871ba67e891a3519c758c94b63 Mon Sep 17 00:00:00 2001 From: ziqingluo-90 Date: Thu, 5 Sep 2024 16:18:50 -0700 Subject: [PATCH 316/425] Re-land "[-Wunsafe-buffer-usage] Warning Libc functions (#101583)" Revert commit 23457964392d00fc872fa6021763859024fb38da, and re-land with a new flag "-Wunsafe-buffer-usage-in-libc-call" for the new warning. 
(rdar://117182250) --- .../Analysis/Analyses/UnsafeBufferUsage.h | 19 + .../Analyses/UnsafeBufferUsageGadgets.def | 13 +- clang/include/clang/Basic/DiagnosticGroups.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td | 7 + clang/lib/Analysis/UnsafeBufferUsage.cpp | 529 +++++++++++++++++- clang/lib/Sema/AnalysisBasedWarnings.cpp | 23 +- ...-usage-libc-functions-inline-namespace.cpp | 60 ++ ...arn-unsafe-buffer-usage-libc-functions.cpp | 124 ++++ ...n-unsafe-buffer-usage-test-unreachable.cpp | 4 +- 9 files changed, 766 insertions(+), 16 deletions(-) create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h index 228b4ae1e3e11..267cde64f8f23 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h @@ -15,6 +15,7 @@ #define LLVM_CLANG_ANALYSIS_ANALYSES_UNSAFEBUFFERUSAGE_H #include "clang/AST/Decl.h" +#include "clang/AST/Expr.h" #include "clang/AST/Stmt.h" #include "clang/Basic/SourceLocation.h" #include "llvm/Support/Debug.h" @@ -106,6 +107,20 @@ class UnsafeBufferUsageHandler { virtual void handleUnsafeOperation(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) = 0; + /// Invoked when a call to an unsafe libc function is found. + /// \param PrintfInfo + /// is 0 if the callee function is not a member of the printf family; + /// is 1 if the callee is `sprintf`; + /// is 2 if arguments of the call have `__size_by` relation but are not in a + /// safe pattern; + /// is 3 if string arguments do not guarantee null-termination + /// is 4 if the callee takes va_list + /// \param UnsafeArg one of the actual arguments that is unsafe, non-null + /// only when `2 <= PrintfInfo <= 3` + virtual void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, + ASTContext &Ctx, + const Expr *UnsafeArg = nullptr) = 0; + /// Invoked when an unsafe operation with a std container is found. virtual void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, @@ -151,6 +166,10 @@ class UnsafeBufferUsageHandler { virtual bool ignoreUnsafeBufferInContainer(const SourceLocation &Loc) const = 0; + /// \return true iff unsafe libc call should NOT be reported at `Loc` + virtual bool + ignoreUnsafeBufferInLibcCall(const SourceLocation &Loc) const = 0; + virtual std::string getUnsafeBufferUsageAttributeTextAt(SourceLocation Loc, StringRef WSSuffix = "") const = 0; diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def index 242ad763ba62b..09fa510bc0472 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def @@ -18,10 +18,10 @@ #define WARNING_GADGET(name) GADGET(name) #endif -/// A `WARNING_GADGET` subset, where the code pattern of each gadget -/// corresponds uses of a (possibly hardened) contatiner (e.g., `std::span`). 
-#ifndef WARNING_CONTAINER_GADGET -#define WARNING_CONTAINER_GADGET(name) WARNING_GADGET(name) +/// A `WARNING_GADGET` subset, each of which may be enable/disable separately +/// with different flags +#ifndef WARNING_OPTIONAL_GADGET +#define WARNING_OPTIONAL_GADGET(name) WARNING_GADGET(name) #endif /// Safe gadgets correspond to code patterns that aren't unsafe but need to be @@ -38,7 +38,8 @@ WARNING_GADGET(PointerArithmetic) WARNING_GADGET(UnsafeBufferUsageAttr) WARNING_GADGET(UnsafeBufferUsageCtorAttr) WARNING_GADGET(DataInvocation) -WARNING_CONTAINER_GADGET(SpanTwoParamConstructor) // Uses of `std::span(arg0, arg1)` +WARNING_OPTIONAL_GADGET(UnsafeLibcFunctionCall) +WARNING_OPTIONAL_GADGET(SpanTwoParamConstructor) // Uses of `std::span(arg0, arg1)` FIXABLE_GADGET(ULCArraySubscript) // `DRE[any]` in an Unspecified Lvalue Context FIXABLE_GADGET(DerefSimplePtrArithFixable) FIXABLE_GADGET(PointerDereference) @@ -52,5 +53,5 @@ FIXABLE_GADGET(PointerInit) #undef FIXABLE_GADGET #undef WARNING_GADGET -#undef WARNING_CONTAINER_GADGET +#undef WARNING_OPTIONAL_GADGET #undef GADGET diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index c4c29942ee1cb..116ce7a04f66f 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1558,7 +1558,8 @@ def ReadOnlyPlacementChecks : DiagGroup<"read-only-types">; // Warnings and fixes to support the "safe buffers" programming model. def UnsafeBufferUsageInContainer : DiagGroup<"unsafe-buffer-usage-in-container">; -def UnsafeBufferUsage : DiagGroup<"unsafe-buffer-usage", [UnsafeBufferUsageInContainer]>; +def UnsafeBufferUsageInLibcCall : DiagGroup<"unsafe-buffer-usage-in-libc-call">; +def UnsafeBufferUsage : DiagGroup<"unsafe-buffer-usage", [UnsafeBufferUsageInContainer, UnsafeBufferUsageInLibcCall]>; // Warnings and notes related to the function effects system underlying // the nonblocking and nonallocating attributes. 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 72ea5338ce615..083684670a980 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12416,6 +12416,13 @@ def warn_unsafe_buffer_operation : Warning< "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data|" "field %1 prone to unsafe buffer manipulation}0">, InGroup, DefaultIgnore; +def warn_unsafe_buffer_libc_call : Warning< + "function %0 is unsafe">, + InGroup, DefaultIgnore; +def note_unsafe_buffer_printf_call : Note< + "%select{|change to 'snprintf' for explicit bounds checking | buffer pointer and size may not match" + "|string argument is not guaranteed to be null-terminated" + "|'va_list' is unsafe}0">; def note_unsafe_buffer_operation : Note< "used%select{| in pointer arithmetic| in buffer access}0 here">; def note_unsafe_buffer_variable_fixit_group : Note< diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index da7446913f7c8..ec76eae8b6077 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -10,12 +10,12 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/FormatString.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Stmt.h" #include "clang/AST/StmtVisitor.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" -#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/Lexer.h" #include "clang/Lex/Preprocessor.h" @@ -247,6 +247,11 @@ AST_MATCHER_P(Stmt, ignoreUnsafeBufferInContainer, return Handler->ignoreUnsafeBufferInContainer(Node.getBeginLoc()); } +AST_MATCHER_P(Stmt, ignoreUnsafeLibcCall, const UnsafeBufferUsageHandler *, + Handler) { + return Handler->ignoreUnsafeBufferInLibcCall(Node.getBeginLoc()); +} + AST_MATCHER_P(CastExpr, castSubExpr, internal::Matcher, innerMatcher) { return innerMatcher.matches(*Node.getSubExpr(), Finder, Builder); } @@ -443,6 +448,425 @@ AST_MATCHER(ArraySubscriptExpr, isSafeArraySubscript) { return false; } +AST_MATCHER_P(CallExpr, hasNumArgs, unsigned, Num) { + return Node.getNumArgs() == Num; +} + +namespace libc_func_matchers { +// Under `libc_func_matchers`, define a set of matchers that match unsafe +// functions in libc and unsafe calls to them. + +// A tiny parser to strip off common prefix and suffix of libc function names +// in real code. +// +// Given a function name, `matchName` returns `CoreName` according to the +// following grammar: +// +// LibcName := CoreName | CoreName + "_s" +// MatchingName := "__builtin_" + LibcName | +// "__builtin___" + LibcName + "_chk" | +// "__asan_" + LibcName +// +struct LibcFunNamePrefixSuffixParser { + StringRef matchName(StringRef FunName, bool isBuiltin) { + // Try to match __builtin_: + if (isBuiltin && FunName.starts_with("__builtin_")) + // Then either it is __builtin_LibcName or __builtin___LibcName_chk or + // no match: + return matchLibcNameOrBuiltinChk( + FunName.drop_front(10 /* truncate "__builtin_" */)); + // Try to match __asan_: + if (FunName.starts_with("__asan_")) + return matchLibcName(FunName.drop_front(7 /* truncate of "__asan_" */)); + return matchLibcName(FunName); + } + + // Parameter `Name` is the substring after stripping off the prefix + // "__builtin_". 
+ StringRef matchLibcNameOrBuiltinChk(StringRef Name) { + if (Name.starts_with("__") && Name.ends_with("_chk")) + return matchLibcName( + Name.drop_front(2).drop_back(4) /* truncate "__" and "_chk" */); + return matchLibcName(Name); + } + + StringRef matchLibcName(StringRef Name) { + if (Name.ends_with("_s")) + return Name.drop_back(2 /* truncate "_s" */); + return Name; + } +}; + +// A pointer type expression is known to be null-terminated, if it has the +// form: E.c_str(), for any expression E of `std::string` type. +static bool isNullTermPointer(const Expr *Ptr) { + if (isa(Ptr->IgnoreParenImpCasts())) + return true; + if (isa(Ptr->IgnoreParenImpCasts())) + return true; + if (auto *MCE = dyn_cast(Ptr->IgnoreParenImpCasts())) { + const CXXMethodDecl *MD = MCE->getMethodDecl(); + const CXXRecordDecl *RD = MCE->getRecordDecl()->getCanonicalDecl(); + + if (MD && RD && RD->isInStdNamespace()) + if (MD->getName() == "c_str" && RD->getName() == "basic_string") + return true; + } + return false; +} + +// Return true iff at least one of following cases holds: +// 1. Format string is a literal and there is an unsafe pointer argument +// corresponding to an `s` specifier; +// 2. Format string is not a literal and there is least an unsafe pointer +// argument (including the formatter argument). +// +// `UnsafeArg` is the output argument that will be set only if this function +// returns true. +static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, + const unsigned FmtArgIdx, ASTContext &Ctx, + bool isKprintf = false) { + class StringFormatStringHandler + : public analyze_format_string::FormatStringHandler { + const CallExpr *Call; + unsigned FmtArgIdx; + const Expr *&UnsafeArg; + + public: + StringFormatStringHandler(const CallExpr *Call, unsigned FmtArgIdx, + const Expr *&UnsafeArg) + : Call(Call), FmtArgIdx(FmtArgIdx), UnsafeArg(UnsafeArg) {} + + bool HandlePrintfSpecifier(const analyze_printf::PrintfSpecifier &FS, + const char *startSpecifier, + unsigned specifierLen, + const TargetInfo &Target) override { + if (FS.getConversionSpecifier().getKind() == + analyze_printf::PrintfConversionSpecifier::sArg) { + unsigned ArgIdx = FS.getPositionalArgIndex() + FmtArgIdx; + + if (0 < ArgIdx && ArgIdx < Call->getNumArgs()) + if (!isNullTermPointer(Call->getArg(ArgIdx))) { + UnsafeArg = Call->getArg(ArgIdx); // output + // returning false stops parsing immediately + return false; + } + } + return true; // continue parsing + } + }; + + const Expr *Fmt = Call->getArg(FmtArgIdx); + + if (auto *SL = dyn_cast(Fmt->IgnoreParenImpCasts())) { + StringRef FmtStr = SL->getString(); + StringFormatStringHandler Handler(Call, FmtArgIdx, UnsafeArg); + + return analyze_format_string::ParsePrintfString( + Handler, FmtStr.begin(), FmtStr.end(), Ctx.getLangOpts(), + Ctx.getTargetInfo(), isKprintf); + } + // If format is not a string literal, we cannot analyze the format string. + // In this case, this call is considered unsafe if at least one argument + // (including the format argument) is unsafe pointer. + return llvm::any_of( + llvm::make_range(Call->arg_begin() + FmtArgIdx, Call->arg_end()), + [&UnsafeArg](const Expr *Arg) -> bool { + if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) { + UnsafeArg = Arg; + return true; + } + return false; + }); +} + +// Matches a FunctionDecl node such that +// 1. It's name, after stripping off predefined prefix and suffix, is +// `CoreName`; and +// 2. 
`CoreName` or `CoreName[str/wcs]` is one of the `PredefinedNames`, which +// is a set of libc function names. +// +// Note: For predefined prefix and suffix, see `LibcFunNamePrefixSuffixParser`. +// The notation `CoreName[str/wcs]` means a new name obtained from replace +// string "wcs" with "str" in `CoreName`. +AST_MATCHER(FunctionDecl, isPredefinedUnsafeLibcFunc) { + static std::unique_ptr> PredefinedNames = nullptr; + if (!PredefinedNames) + PredefinedNames = + std::make_unique, std::set>({ + // numeric conversion: + "atof", + "atoi", + "atol", + "atoll", + "strtol", + "strtoll", + "strtoul", + "strtoull", + "strtof", + "strtod", + "strtold", + "strtoimax", + "strtoumax", + // "strfromf", "strfromd", "strfroml", // C23? + // string manipulation: + "strcpy", + "strncpy", + "strlcpy", + "strcat", + "strncat", + "strlcat", + "strxfrm", + "strdup", + "strndup", + // string examination: + "strlen", + "strnlen", + "strcmp", + "strncmp", + "stricmp", + "strcasecmp", + "strcoll", + "strchr", + "strrchr", + "strspn", + "strcspn", + "strpbrk", + "strstr", + "strtok", + // "mem-" functions + "memchr", + "wmemchr", + "memcmp", + "wmemcmp", + "memcpy", + "memccpy", + "mempcpy", + "wmemcpy", + "memmove", + "wmemmove", + "memset", + "wmemset", + // IO: + "fread", + "fwrite", + "fgets", + "fgetws", + "gets", + "fputs", + "fputws", + "puts", + // others + "strerror_s", + "strerror_r", + "bcopy", + "bzero", + "bsearch", + "qsort", + }); + + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + // Match predefined names: + if (PredefinedNames->find(Name) != PredefinedNames->end()) + return true; + + std::string NameWCS = Name.str(); + size_t WcsPos = NameWCS.find("wcs"); + + while (WcsPos != std::string::npos) { + NameWCS[WcsPos++] = 's'; + NameWCS[WcsPos++] = 't'; + NameWCS[WcsPos++] = 'r'; + WcsPos = NameWCS.find("wcs", WcsPos); + } + if (PredefinedNames->find(NameWCS) != PredefinedNames->end()) + return true; + // All `scanf` functions are unsafe (including `sscanf`, `vsscanf`, etc.. They + // all should end with "scanf"): + return Name.ends_with("scanf"); +} + +// Match a call to one of the `v*printf` functions taking `va_list`. We cannot +// check safety for these functions so they should be changed to their +// non-va_list versions. +AST_MATCHER(FunctionDecl, isUnsafeVaListPrintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf")) + return false; // neither printf nor scanf + return Name.starts_with("v"); +} + +// Matches a call to one of the `sprintf` functions as they are always unsafe +// and should be changed to `snprintf`. +AST_MATCHER(FunctionDecl, isUnsafeSprintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf") || + // Let `isUnsafeVaListPrintfFunc` check for cases with va-list: + Name.starts_with("v")) + return false; + + StringRef Prefix = Name.drop_back(6); + + if (Prefix.ends_with("w")) + Prefix = Prefix.drop_back(1); + return Prefix == "s"; +} + +// Match function declarations of `printf`, `fprintf`, `snprintf` and their wide +// character versions. Calls to these functions can be safe if their arguments +// are carefully made safe. 
+AST_MATCHER(FunctionDecl, isNormalPrintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf") || Name.starts_with("v")) + return false; + + StringRef Prefix = Name.drop_back(6); + + if (Prefix.ends_with("w")) + Prefix = Prefix.drop_back(1); + + return Prefix.empty() || Prefix == "k" || Prefix == "f" || Prefix == "sn"; +} + +// This matcher requires that it is known that the callee `isNormalPrintf`. +// Then if the format string is a string literal, this matcher matches when at +// least one string argument is unsafe. If the format is not a string literal, +// this matcher matches when at least one pointer type argument is unsafe. +AST_MATCHER_P(CallExpr, hasUnsafePrintfStringArg, + clang::ast_matchers::internal::Matcher, + UnsafeStringArgMatcher) { + // Determine what printf it is: + const Expr *FirstArg = Node.getArg(0); + ASTContext &Ctx = Finder->getASTContext(); + + if (isa(FirstArg->IgnoreParenImpCasts())) { + // It is a printf/kprintf. And, the format is a string literal: + bool isKprintf = false; + const Expr *UnsafeArg; + + if (auto *Callee = Node.getDirectCallee()) + if (auto *II = Callee->getIdentifier()) + isKprintf = II->getName() == "kprintf"; + if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 0, Ctx, isKprintf)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + + QualType PtrTy = FirstArg->getType(); + + assert(PtrTy->isPointerType()); + + QualType PteTy = (cast(PtrTy))->getPointeeType(); + + if (!Ctx.getFILEType().isNull() /* If `FILE *` is not ever in the ASTContext, + there can't be any file pointer then */ + && PteTy.getCanonicalType() == Ctx.getFILEType().getCanonicalType()) { + // It is a fprintf: + const Expr *UnsafeArg; + + if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 1, Ctx, false)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + + const Expr *SecondArg = Node.getArg(1); + + if (SecondArg->getType()->isIntegerType()) { + // It is a snprintf: + const Expr *UnsafeArg; + + if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 2, Ctx, false)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + // It is printf but the format string is passed by pointer. The only thing we + // can do is to require all pointers to be null-terminated: + for (auto Arg : Node.arguments()) + if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) + if (UnsafeStringArgMatcher.matches(*Arg, Finder, Builder)) + return true; + return false; +} + +// This matcher requires that it is known that the callee `isNormalPrintf`. +// Then it matches if the first two arguments of the call is a pointer and an +// integer and they are not in a safe pattern. +// +// For the first two arguments: `ptr` and `size`, they are safe if in the +// following patterns: +// ptr := DRE.data(); +// size:= DRE.size()/DRE.size_bytes() +// And DRE is a hardened container or view. 
+AST_MATCHER(CallExpr, hasUnsafeSnprintfBuffer) { + if (Node.getNumArgs() < 3) + return false; // not an snprintf call + + const Expr *Buf = Node.getArg(0), *Size = Node.getArg(1); + + if (!Buf->getType()->isPointerType() || !Size->getType()->isIntegerType()) + return false; // not an snprintf call + + static StringRef SizedObjs[] = {"span", "array", "vector", + "basic_string_view", "basic_string"}; + Buf = Buf->IgnoreParenImpCasts(); + Size = Size->IgnoreParenImpCasts(); + if (auto *MCEPtr = dyn_cast(Buf)) + if (auto *MCESize = dyn_cast(Size)) { + auto *DREOfPtr = dyn_cast( + MCEPtr->getImplicitObjectArgument()->IgnoreParenImpCasts()); + auto *DREOfSize = dyn_cast( + MCESize->getImplicitObjectArgument()->IgnoreParenImpCasts()); + + if (!DREOfPtr || !DREOfSize) + return true; // not in safe pattern + if (DREOfPtr->getDecl() != DREOfSize->getDecl()) + return true; // not in safe pattern + if (MCEPtr->getMethodDecl()->getName() != "data") + return true; // not in safe pattern + + if (MCESize->getMethodDecl()->getName() == "size_bytes" || + // Note here the pointer must be a pointer-to-char type unless there + // is explicit casting. If there is explicit casting, this branch + // is unreachable. Thus, at this branch "size" and "size_bytes" are + // equivalent as the pointer is a char pointer: + MCESize->getMethodDecl()->getName() == "size") + for (StringRef SizedObj : SizedObjs) + if (MCEPtr->getRecordDecl()->isInStdNamespace() && + MCEPtr->getRecordDecl()->getCanonicalDecl()->getName() == + SizedObj) + return false; // It is in fact safe + } + return true; // ptr and size are not in safe pattern +} +} // namespace libc_func_matchers } // namespace clang::ast_matchers namespace { @@ -760,6 +1184,10 @@ class SpanTwoParamConstructorGadget : public WarningGadget { .bind(SpanTwoParamConstructorTag)); } + static Matcher matcher(const UnsafeBufferUsageHandler *Handler) { + return stmt(unless(ignoreUnsafeBufferInContainer(Handler)), matcher()); + } + void handleUnsafeOperation(UnsafeBufferUsageHandler &Handler, bool IsRelatedToDecl, ASTContext &Ctx) const override { @@ -1030,6 +1458,98 @@ class DataInvocationGadget : public WarningGadget { DeclUseList getClaimedVarUseSites() const override { return {}; } }; +class UnsafeLibcFunctionCallGadget : public WarningGadget { + const CallExpr *const Call; + const Expr *UnsafeArg = nullptr; + constexpr static const char *const Tag = "UnsafeLibcFunctionCall"; + // Extra tags for additional information: + constexpr static const char *const UnsafeSprintfTag = + "UnsafeLibcFunctionCall_sprintf"; + constexpr static const char *const UnsafeSizedByTag = + "UnsafeLibcFunctionCall_sized_by"; + constexpr static const char *const UnsafeStringTag = + "UnsafeLibcFunctionCall_string"; + constexpr static const char *const UnsafeVaListTag = + "UnsafeLibcFunctionCall_va_list"; + + enum UnsafeKind { + OTHERS = 0, // no specific information, the callee function is unsafe + SPRINTF = 1, // never call `-sprintf`s, call `-snprintf`s instead. 
+ SIZED_BY = + 2, // the first two arguments of `snprintf` function have + // "__sized_by" relation but they do not conform to safe patterns + STRING = 3, // an argument is a pointer-to-char-as-string but does not + // guarantee null-termination + VA_LIST = 4, // one of the `-printf`s function that take va_list, which is + // considered unsafe as it is not compile-time check + } WarnedFunKind = OTHERS; + +public: + UnsafeLibcFunctionCallGadget(const MatchFinder::MatchResult &Result) + : WarningGadget(Kind::UnsafeLibcFunctionCall), + Call(Result.Nodes.getNodeAs(Tag)) { + if (Result.Nodes.getNodeAs(UnsafeSprintfTag)) + WarnedFunKind = SPRINTF; + else if (auto *E = Result.Nodes.getNodeAs(UnsafeStringTag)) { + WarnedFunKind = STRING; + UnsafeArg = E; + } else if (Result.Nodes.getNodeAs(UnsafeSizedByTag)) { + WarnedFunKind = SIZED_BY; + UnsafeArg = Call->getArg(0); + } else if (Result.Nodes.getNodeAs(UnsafeVaListTag)) + WarnedFunKind = VA_LIST; + } + + static Matcher matcher(const UnsafeBufferUsageHandler *Handler) { + return stmt(unless(ignoreUnsafeLibcCall(Handler)), + anyOf( + callExpr( + callee(functionDecl(anyOf( + // Match a predefined unsafe libc + // function: + functionDecl(libc_func_matchers::isPredefinedUnsafeLibcFunc()), + // Match a call to one of the `v*printf` functions + // taking va-list, which cannot be checked at + // compile-time: + functionDecl(libc_func_matchers::isUnsafeVaListPrintfFunc()) + .bind(UnsafeVaListTag), + // Match a call to a `sprintf` function, which is never + // safe: + functionDecl(libc_func_matchers::isUnsafeSprintfFunc()) + .bind(UnsafeSprintfTag)))), + // (unless the call has a sole string literal argument): + unless( + allOf(hasArgument(0, expr(stringLiteral())), hasNumArgs(1)))), + + // The following two cases require checking against actual + // arguments of the call: + + // Match a call to an `snprintf` function. And first two + // arguments of the call (that describe a buffer) are not in + // safe patterns: + callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), + libc_func_matchers::hasUnsafeSnprintfBuffer()) + .bind(UnsafeSizedByTag), + // Match a call to a `printf` function, which can be safe if + // all arguments are null-terminated: + callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), + libc_func_matchers::hasUnsafePrintfStringArg( + expr().bind(UnsafeStringTag))))); + } + + const Stmt *getBaseStmt() const { return Call; } + + SourceLocation getSourceLoc() const override { return Call->getBeginLoc(); } + + void handleUnsafeOperation(UnsafeBufferUsageHandler &Handler, + bool IsRelatedToDecl, + ASTContext &Ctx) const override { + Handler.handleUnsafeLibcCall(Call, WarnedFunKind, Ctx, UnsafeArg); + } + + DeclUseList getClaimedVarUseSites() const override { return {}; } +}; + // Represents expressions of the form `DRE[*]` in the Unspecified Lvalue // Context (see `isInUnspecifiedLvalueContext`). // Note here `[]` is the built-in subscript operator. 
@@ -1452,10 +1972,9 @@ findGadgets(const Decl *D, const UnsafeBufferUsageHandler &Handler, #define WARNING_GADGET(x) \ allOf(x ## Gadget::matcher().bind(#x), \ notInSafeBufferOptOut(&Handler)), -#define WARNING_CONTAINER_GADGET(x) \ - allOf(x ## Gadget::matcher().bind(#x), \ - notInSafeBufferOptOut(&Handler), \ - unless(ignoreUnsafeBufferInContainer(&Handler))), +#define WARNING_OPTIONAL_GADGET(x) \ + allOf(x ## Gadget::matcher(&Handler).bind(#x), \ + notInSafeBufferOptOut(&Handler)), #include "clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def" // Avoid a hanging comma. unless(stmt()) diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index e6ce89dc7ec40..117b2c8bc5793 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2304,6 +2304,20 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { } } + void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, + ASTContext &Ctx, + const Expr *UnsafeArg = nullptr) override { + S.Diag(Call->getBeginLoc(), diag::warn_unsafe_buffer_libc_call) + << Call->getDirectCallee() // We've checked there is a direct callee + << Call->getSourceRange(); + if (PrintfInfo > 0) { + SourceRange R = + UnsafeArg ? UnsafeArg->getSourceRange() : Call->getSourceRange(); + S.Diag(R.getBegin(), diag::note_unsafe_buffer_printf_call) + << PrintfInfo << R; + } + } + void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) override { @@ -2382,6 +2396,10 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { return S.Diags.isIgnored(diag::warn_unsafe_buffer_usage_in_container, Loc); } + bool ignoreUnsafeBufferInLibcCall(const SourceLocation &Loc) const override { + return S.Diags.isIgnored(diag::warn_unsafe_buffer_libc_call, Loc); + } + // Returns the text representation of clang::unsafe_buffer_usage attribute. // `WSSuffix` holds customized "white-space"s, e.g., newline or whilespace // characters. 
@@ -2548,6 +2566,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( !Diags.isIgnored(diag::warn_unsafe_buffer_variable, Node->getBeginLoc()) || !Diags.isIgnored(diag::warn_unsafe_buffer_usage_in_container, + Node->getBeginLoc()) || + !Diags.isIgnored(diag::warn_unsafe_buffer_libc_call, Node->getBeginLoc())) { clang::checkUnsafeBufferUsage(Node, R, UnsafeBufferUsageShouldEmitSuggestions); @@ -2560,7 +2580,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( if (!Diags.isIgnored(diag::warn_unsafe_buffer_operation, SourceLocation()) || !Diags.isIgnored(diag::warn_unsafe_buffer_variable, SourceLocation()) || !Diags.isIgnored(diag::warn_unsafe_buffer_usage_in_container, - SourceLocation())) { + SourceLocation()) || + !Diags.isIgnored(diag::warn_unsafe_buffer_libc_call, SourceLocation())) { CallableVisitor(CallAnalyzers).TraverseTranslationUnitDecl(TU); } } diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp new file mode 100644 index 0000000000000..2bd12db93fd52 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp @@ -0,0 +1,60 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ +// RUN: -verify %s + +namespace std { + inline namespace __1 { + template< class InputIt, class OutputIt > + OutputIt copy( InputIt first, InputIt last, + OutputIt d_first ); + + struct iterator{}; + template + struct span { + T * ptr; + T * data(); + unsigned size_bytes(); + unsigned size(); + iterator begin() const noexcept; + iterator end() const noexcept; + }; + + template + struct basic_string { + T* p; + T *c_str(); + T *data(); + unsigned size_bytes(); + }; + + typedef basic_string string; + typedef basic_string wstring; + + // C function under std: + void memcpy(); + void strcpy(); + int snprintf( char* buffer, unsigned buf_size, const char* format, ... 
); + } +} + +void f(char * p, char * q, std::span s) { + std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + std::__1::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::__1::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + + /* Test printfs */ + std::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + std::__1::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + std::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn + std::__1::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn +} + +void v(std::string s1) { + std::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn + std::__1::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn +} + +void g(char *begin, char *end, char *p, std::span s) { + std::copy(begin, end, p); // no warn + std::copy(s.begin(), s.end(), s.begin()); // no warn +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp new file mode 100644 index 0000000000000..0438f71b1c792 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp @@ -0,0 +1,124 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ +// RUN: -verify %s +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage-in-libc-call \ +// RUN: -verify %s + +typedef struct {} FILE; +void memcpy(); +void __asan_memcpy(); +void strcpy(); +void strcpy_s(); +void wcscpy_s(); +unsigned strlen( const char* str ); +int fprintf( FILE* stream, const char* format, ... ); +int printf( const char* format, ... ); +int sprintf( char* buffer, const char* format, ... ); +int swprintf( char* buffer, const char* format, ... ); +int snprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int snwprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int snwprintf_s( char* buffer, unsigned buf_size, const char* format, ... ); +int vsnprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int sscanf_s(const char * buffer, const char * format, ...); +int sscanf(const char * buffer, const char * format, ... 
); + +namespace std { + template< class InputIt, class OutputIt > + OutputIt copy( InputIt first, InputIt last, + OutputIt d_first ); + + struct iterator{}; + template + struct span { + T * ptr; + T * data(); + unsigned size_bytes(); + unsigned size(); + iterator begin() const noexcept; + iterator end() const noexcept; + }; + + template + struct basic_string { + T* p; + T *c_str(); + T *data(); + unsigned size_bytes(); + }; + + typedef basic_string string; + typedef basic_string wstring; + + // C function under std: + void memcpy(); + void strcpy(); +} + +void f(char * p, char * q, std::span s, std::span s2) { + memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + __builtin_memcpy(p, q, 64); // expected-warning{{function '__builtin_memcpy' is unsafe}} + __builtin___memcpy_chk(p, q, 8, 64); // expected-warning{{function '__builtin___memcpy_chk' is unsafe}} + __asan_memcpy(); // expected-warning{{function '__asan_memcpy' is unsafe}} + strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + strcpy_s(); // expected-warning{{function 'strcpy_s' is unsafe}} + wcscpy_s(); // expected-warning{{function 'wcscpy_s' is unsafe}} + + + /* Test printfs */ + fprintf((FILE*)p, "%s%d", p, *p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} + printf("%s%d", // expected-warning{{function 'printf' is unsafe}} + p, // expected-note{{string argument is not guaranteed to be null-terminated}} note attached to the unsafe argument + *p); + sprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'sprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} + swprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'swprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} + snprintf(q, 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snwprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snwprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snwprintf_s( // expected-warning{{function 'snwprintf_s' is unsafe}} + s.data(), // expected-note{{buffer pointer and size may not match}} // note attached to the buffer + s2.size(), + "%s%d", "hello", *p); + vsnprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // expected-warning{{function 'vsnprintf' is unsafe}} expected-note{{'va_list' is unsafe}} + sscanf(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf' is unsafe}} + sscanf_s(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf_s' is unsafe}} + fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} + fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, "hello"); // no warn + printf("%s%d", "hello", *p); // no warn + snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn + snprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + snwprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + 
snwprintf_s(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + strlen("hello");// no warn +} + +void v(std::string s1, int *p) { + snprintf(s1.data(), s1.size_bytes(), "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + snprintf(s1.data(), s1.size_bytes(), s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn + printf("%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + printf(s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn + fprintf((FILE*)0, "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + fprintf((FILE*)0, s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn +} + + +void g(char *begin, char *end, char *p, std::span s) { + std::copy(begin, end, p); // no warn + std::copy(s.begin(), s.end(), s.begin()); // no warn +} + +// warning gets turned off +void ff(char * p, char * q, std::span s, std::span s2) { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunsafe-buffer-usage-in-libc-call" + memcpy(); + std::memcpy(); + __builtin_memcpy(p, q, 64); + __builtin___memcpy_chk(p, q, 8, 64); + __asan_memcpy(); + strcpy(); + std::strcpy(); + strcpy_s(); + wcscpy_s(); +#pragma clang diagnostic pop +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp index 844311c3a51a5..989931e41c0cc 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp @@ -1,8 +1,6 @@ // RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage -fsafe-buffer-usage-suggestions -verify %s -// expected-no-diagnostics - typedef unsigned __darwin_size_t; typedef __darwin_size_t size_t; #define bzero(s, n) __builtin_bzero(s, n) -void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } +void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } // expected-warning{{function '__builtin_bzero' is unsafe}} From a84baef74892dc294eb65bb2a1ea2339be13e8b2 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 5 Sep 2024 20:37:22 -0400 Subject: [PATCH 317/425] [gn build] Port 52dca6ffae08 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index 8114c88e665e6..79ba8e3afdf40 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -4,6 +4,7 @@ static_library("Vectorize") { "//llvm/lib/Analysis", "//llvm/lib/IR", "//llvm/lib/Support", + "//llvm/lib/SandboxIR", "//llvm/lib/Transforms/Utils", ] sources = [ @@ -12,6 +13,7 @@ static_library("Vectorize") { "LoopVectorizationLegality.cpp", "LoopVectorize.cpp", "SLPVectorizer.cpp", + "SandboxVectorizer/SandboxVectorizer.cpp", "VPlan.cpp", "VPlanAnalysis.cpp", "VPlanHCFGBuilder.cpp", From 2949720c2e55d2695682d6412d5afe45b167cb1e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 6 Sep 2024 08:59:13 +0800 Subject: [PATCH 318/425] [RISCV] Move vmerge same mask peephole to RISCVVectorPeephole (#106108) We currently fold a vmerge.vvm into its true operand if the true operand is a masked pseudo with the same mask. We can move this over to RISCVVectorPeephole by instead splitting it up into a smaller peephole which converts it to a vmv.v.v first. 
The existing foldVMV_V_V peephole will then take care of folding it if needed. This is very similar to the existing all-ones mask peephole and we could potentially do it inside of it. I opted to put it in a separate peephole to make it easier to reason about, given that the duplication is small, but I could be persuaded either way. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 42 ++--------- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 74 ++++++++++++++++--- .../fixed-vectors-strided-load-store-asm.ll | 22 +++--- .../RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir | 70 ++++++++++++++++++ 4 files changed, 148 insertions(+), 60 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 4580f3191d138..ff4c0e9bbd50e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3833,15 +3833,8 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { uint64_t TrueTSFlags = TrueMCID.TSFlags; bool HasTiedDest = RISCVII::isFirstDefTiedToFirstUse(TrueMCID); - bool IsMasked = false; const RISCV::RISCVMaskedPseudoInfo *Info = RISCV::lookupMaskedIntrinsicByUnmasked(TrueOpc); - if (!Info && HasTiedDest) { - Info = RISCV::getMaskedPseudoInfo(TrueOpc); - IsMasked = true; - } - assert(!(IsMasked && !HasTiedDest) && "Expected tied dest"); - if (!Info) return false; @@ -3853,19 +3846,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { return false; } - // If True is masked then the vmerge must have either the same mask or an all - // 1s mask, since we're going to keep the mask from True. - if (IsMasked) { - // FIXME: Support mask agnostic True instruction which would have an - // undef passthru operand. - SDValue TrueMask = - getMaskSetter(True->getOperand(Info->MaskOpIdx), - True->getOperand(True->getNumOperands() - 1)); - assert(TrueMask); - if (!usesAllOnesMask(Mask, Glue) && getMaskSetter(Mask, Glue) != TrueMask) - return false; - } - // Skip if True has side effect. if (TII->get(TrueOpc).hasUnmodeledSideEffects()) return false; @@ -3930,24 +3910,13 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { (Mask && !usesAllOnesMask(Mask, Glue))) return false; - // If we end up changing the VL or mask of True, then we need to make sure it - // doesn't raise any observable fp exceptions, since changing the active - // elements will affect how fflags is set. - if (TrueVL != VL || !IsMasked) - if (mayRaiseFPException(True.getNode()) && - !True->getFlags().hasNoFPExcept()) - return false; + // Make sure it doesn't raise any observable fp exceptions, since changing the + // active elements will affect how fflags is set. + if (mayRaiseFPException(True.getNode()) && !True->getFlags().hasNoFPExcept()) + return false; SDLoc DL(N); - // From the preconditions we checked above, we know the mask and thus glue - // for the result node will be taken from True. 
- if (IsMasked) { - Mask = True->getOperand(Info->MaskOpIdx); - Glue = True->getOperand(True->getNumOperands() - 1); - assert(Glue.getValueType() == MVT::Glue); - } - unsigned MaskedOpc = Info->MaskedPseudo; #ifndef NDEBUG const MCInstrDesc &MaskedMCID = TII->get(MaskedOpc); @@ -3977,8 +3946,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { Ops.push_back(False); const bool HasRoundingMode = RISCVII::hasRoundModeOp(TrueTSFlags); - const unsigned NormalOpsEnd = TrueVLIndex - IsMasked - HasRoundingMode; - assert(!IsMasked || NormalOpsEnd == Info->MaskOpIdx); + const unsigned NormalOpsEnd = TrueVLIndex - HasRoundingMode; Ops.append(True->op_begin() + HasTiedDest, True->op_begin() + NormalOpsEnd); Ops.push_back(Mask); diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index db8e496493c41..48ea2a0187617 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -65,7 +65,8 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool convertToVLMAX(MachineInstr &MI) const; bool convertToWholeRegister(MachineInstr &MI) const; bool convertToUnmasked(MachineInstr &MI) const; - bool convertVMergeToVMv(MachineInstr &MI) const; + bool convertAllOnesVMergeToVMv(MachineInstr &MI) const; + bool convertSameMaskVMergeToVMv(MachineInstr &MI) const; bool foldUndefPassthruVMV_V_V(MachineInstr &MI); bool foldVMV_V_V(MachineInstr &MI); @@ -342,17 +343,13 @@ bool RISCVVectorPeephole::convertToWholeRegister(MachineInstr &MI) const { return true; } -// Transform (VMERGE_VVM_ pt, false, true, allones, vl, sew) to -// (VMV_V_V_ pt, true, vl, sew). It may decrease uses of VMSET. -bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const { +static unsigned getVMV_V_VOpcodeForVMERGE_VVM(const MachineInstr &MI) { #define CASE_VMERGE_TO_VMV(lmul) \ case RISCV::PseudoVMERGE_VVM_##lmul: \ - NewOpc = RISCV::PseudoVMV_V_V_##lmul; \ - break; - unsigned NewOpc; + return RISCV::PseudoVMV_V_V_##lmul; switch (MI.getOpcode()) { default: - return false; + return 0; CASE_VMERGE_TO_VMV(MF8) CASE_VMERGE_TO_VMV(MF4) CASE_VMERGE_TO_VMV(MF2) @@ -361,14 +358,68 @@ bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const { CASE_VMERGE_TO_VMV(M4) CASE_VMERGE_TO_VMV(M8) } +} +/// Convert a PseudoVMERGE_VVM with an all ones mask to a PseudoVMV_V_V. +/// +/// %x = PseudoVMERGE_VVM %passthru, %false, %true, %allones, sew, vl +/// -> +/// %x = PseudoVMV_V_V %passthru, %true, vl, sew, tu_mu +bool RISCVVectorPeephole::convertAllOnesVMergeToVMv(MachineInstr &MI) const { + unsigned NewOpc = getVMV_V_VOpcodeForVMERGE_VVM(MI); + if (!NewOpc) + return false; assert(MI.getOperand(4).isReg() && MI.getOperand(4).getReg() == RISCV::V0); if (!isAllOnesMask(V0Defs.lookup(&MI))) return false; MI.setDesc(TII->get(NewOpc)); - MI.removeOperand(2); // False operand - MI.removeOperand(3); // Mask operand + MI.removeOperand(2); // False operand + MI.removeOperand(3); // Mask operand + MI.addOperand( + MachineOperand::CreateImm(RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED)); + + // vmv.v.v doesn't have a mask operand, so we may be able to inflate the + // register class for the destination and passthru operands e.g. 
VRNoV0 -> VR + MRI->recomputeRegClass(MI.getOperand(0).getReg()); + if (MI.getOperand(1).getReg() != RISCV::NoRegister) + MRI->recomputeRegClass(MI.getOperand(1).getReg()); + return true; +} + +/// If a PseudoVMERGE_VVM's true operand is a masked pseudo and both have the +/// same mask, and the masked pseudo's passthru is the same as the false +/// operand, we can convert the PseudoVMERGE_VVM to a PseudoVMV_V_V. +/// +/// %true = PseudoVADD_VV_M1_MASK %false, %x, %y, %mask, vl1, sew, policy +/// %x = PseudoVMERGE_VVM %passthru, %false, %true, %mask, vl2, sew +/// -> +/// %true = PseudoVADD_VV_M1_MASK %false, %x, %y, %mask, vl1, sew, policy +/// %x = PseudoVMV_V_V %passthru, %true, vl2, sew, tu_mu +bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) const { + unsigned NewOpc = getVMV_V_VOpcodeForVMERGE_VVM(MI); + if (!NewOpc) + return false; + MachineInstr *True = MRI->getVRegDef(MI.getOperand(3).getReg()); + if (!True || !RISCV::getMaskedPseudoInfo(True->getOpcode()) || + !hasSameEEW(MI, *True)) + return false; + + // True's passthru needs to be equivalent to False + Register TruePassthruReg = True->getOperand(1).getReg(); + Register FalseReg = MI.getOperand(2).getReg(); + if (TruePassthruReg != RISCV::NoRegister && TruePassthruReg != FalseReg) + return false; + + const MachineInstr *TrueV0Def = V0Defs.lookup(True); + const MachineInstr *MIV0Def = V0Defs.lookup(&MI); + assert(TrueV0Def && TrueV0Def->isCopy() && MIV0Def && MIV0Def->isCopy()); + if (TrueV0Def->getOperand(1).getReg() != MIV0Def->getOperand(1).getReg()) + return false; + + MI.setDesc(TII->get(NewOpc)); + MI.removeOperand(2); // False operand + MI.removeOperand(3); // Mask operand MI.addOperand( MachineOperand::CreateImm(RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED)); @@ -623,7 +674,8 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) { Changed |= tryToReduceVL(MI); Changed |= convertToUnmasked(MI); Changed |= convertToWholeRegister(MI); - Changed |= convertVMergeToVMv(MI); + Changed |= convertAllOnesVMergeToVMv(MI); + Changed |= convertSameMaskVMergeToVMv(MI); if (foldUndefPassthruVMV_V_V(MI)) { Changed |= true; continue; // MI is erased diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index e57b6a22dd6ea..569ada7949b1b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -62,12 +62,11 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado ; CHECK-NEXT: li a4, 5 ; CHECK-NEXT: .LBB1_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmv1r.v v9, v8 -; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-NEXT: vlse8.v v9, (a1), a4, v0.t -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vadd.vv v9, v10, v9 -; CHECK-NEXT: vse8.v v9, (a0) +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a1), a4, v0.t +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: addi a1, a1, 160 ; CHECK-NEXT: bne a0, a2, .LBB1_1 @@ -344,12 +343,11 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read ; CHECK-NEXT: li a4, 5 ; CHECK-NEXT: .LBB7_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vmv1r.v v10, v8 -; 
CHECK-NEXT: vlse8.v v10, (a0), a4, v0.t -; CHECK-NEXT: vadd.vv v9, v10, v9 -; CHECK-NEXT: vsse8.v v9, (a0), a4, v0.t +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vlse8.v v9, (a0), a4, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsse8.v v8, (a0), a4, v0.t ; CHECK-NEXT: addi a1, a1, 32 ; CHECK-NEXT: addi a0, a0, 160 ; CHECK-NEXT: bne a1, a2, .LBB7_1 diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir index 19a918148e6eb..875d4229bbc6e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir @@ -68,3 +68,73 @@ body: | $v0 = COPY %mask %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, %avl, 5 ... +--- +name: same_mask +body: | + bb.0: + liveins: $v8, $v9, $v0 + ; CHECK-LABEL: name: same_mask + ; CHECK: liveins: $v8, $v9, $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %pt:vr = COPY $v8 + ; CHECK-NEXT: %false:vrnov0 = COPY $v9 + ; CHECK-NEXT: %mask:vr = COPY $v0 + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 8, 5 /* e32 */, 0 /* tu, mu */ + %pt:vrnov0 = COPY $v8 + %false:vrnov0 = COPY $v9 + %mask:vr = COPY $v0 + $v0 = COPY %mask + %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ + $v0 = COPY %mask + %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ +... +--- +# Shouldn't be converted because false operands are different +name: same_mask_different_false +body: | + bb.0: + liveins: $v8, $v9, $v0 + ; CHECK-LABEL: name: same_mask_different_false + ; CHECK: liveins: $v8, $v9, $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %pt:vrnov0 = COPY $v8 + ; CHECK-NEXT: %false:vrnov0 = COPY $v9 + ; CHECK-NEXT: %mask:vr = COPY $v0 + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %pt, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ + %pt:vrnov0 = COPY $v8 + %false:vrnov0 = COPY $v9 + %mask:vr = COPY $v0 + $v0 = COPY %mask + %true:vrnov0 = PseudoVADD_VV_M1_MASK %pt, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ + $v0 = COPY %mask + %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ +... 
+--- +# Shouldn't be converted because EEWs are different +name: same_mask_different_eew +body: | + bb.0: + liveins: $v8, $v9, $v0 + ; CHECK-LABEL: name: same_mask_different_eew + ; CHECK: liveins: $v8, $v9, $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %pt:vrnov0 = COPY $v8 + ; CHECK-NEXT: %false:vrnov0 = COPY $v9 + ; CHECK-NEXT: %mask:vr = COPY $v0 + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ + %pt:vrnov0 = COPY $v8 + %false:vrnov0 = COPY $v9 + %mask:vr = COPY $v0 + $v0 = COPY %mask + %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 4 /* e16 */, 0 /* tu, mu */ + $v0 = COPY %mask + %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ From 11084c5c49f8bb7825f81adc5b7140b3506fe253 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 18:00:46 -0700 Subject: [PATCH 319/425] [lldb] Convert DebuggerThread.cpp to new Status API (NFC) --- lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp index 9a5fcd88e1282..d62eb26ca1a29 100644 --- a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp @@ -75,7 +75,7 @@ Status DebuggerThread::DebugAttach(lldb::pid_t pid, return DebuggerThreadAttachRoutine(pid, attach_info); }); if (!secondary_thread) { - result = Status(secondary_thread.takeError()); + result = Status::FromError(secondary_thread.takeError()); LLDB_LOG(log, "couldn't attach to process '{0}'. {1}", pid, result); } From 16df489fdae23e77eb5180e4d4dc99b07421bf77 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 5 Sep 2024 18:35:55 -0700 Subject: [PATCH 320/425] [TableGen] Add const variants of accessors for backend (#106658) Split RecordKeeper `getAllDerivedDefinitions` family of functions into two variants: (a) non-const ones that return vectors of `Record *` and (b) const ones, that return vector/ArrayRef of `const Record *`. This will help gradual migration of TableGen backends to use `const RecordKeeper` and by implication change code to work with const pointers and better const correctness. Existing backends are not yet compatible with the const family of functions, so change them to use a non-constant `RecordKeeper` reference, till they are migrated. 
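As a rough usage sketch (illustrative only; the `EmitFoo`/`EmitBar` backends and
the "Foo"/"Bar" record classes are made up), a migrated backend takes a
`const RecordKeeper &` and receives the const results, while an unmigrated one
keeps the non-const reference:

  // Migrated backend: const overload returns ArrayRef<const Record *>.
  static void EmitFoo(const RecordKeeper &Records, raw_ostream &OS) {
    for (const Record *Def : Records.getAllDerivedDefinitions("Foo"))
      OS << Def->getName() << "\n";
  }

  // Unmigrated backend: non-const overload returns const std::vector<Record *> &.
  static void EmitBar(RecordKeeper &Records, raw_ostream &OS) {
    for (Record *Def : Records.getAllDerivedDefinitions("Bar"))
      OS << Def->getName() << "\n";
  }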
--- clang/utils/TableGen/ClangAttrEmitter.cpp | 4 +- clang/utils/TableGen/ClangSyntaxEmitter.cpp | 2 +- llvm/include/llvm/TableGen/DirectiveEmitter.h | 5 +- llvm/include/llvm/TableGen/Record.h | 34 ++++++++-- llvm/lib/TableGen/Record.cpp | 65 +++++++++++++++---- .../TableGen/Basic/CodeGenIntrinsics.cpp | 2 +- .../TableGen/Common/SubtargetFeatureInfo.cpp | 2 +- .../TableGen/Common/SubtargetFeatureInfo.h | 2 +- llvm/utils/TableGen/ExegesisEmitter.cpp | 2 +- llvm/utils/TableGen/GlobalISelEmitter.cpp | 2 +- llvm/utils/TableGen/SubtargetEmitter.cpp | 2 +- llvm/utils/TableGen/TableGen.cpp | 2 +- mlir/include/mlir/TableGen/GenInfo.h | 6 +- mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp | 28 ++++---- mlir/tools/mlir-tblgen/OmpOpGen.cpp | 2 +- mlir/tools/mlir-tblgen/OpDocGen.cpp | 20 +++--- mlir/tools/mlir-tblgen/OpInterfacesGen.cpp | 15 ++--- mlir/tools/mlir-tblgen/RewriterGen.cpp | 8 +-- .../tools/tblgen-to-irdl/OpDefinitionsGen.cpp | 7 +- 19 files changed, 134 insertions(+), 76 deletions(-) diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index adbe6af62d5cb..d24215d10f17c 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -189,7 +189,7 @@ static StringRef NormalizeGNUAttrSpelling(StringRef AttrSpelling) { typedef std::vector> ParsedAttrMap; -static ParsedAttrMap getParsedAttrList(const RecordKeeper &Records, +static ParsedAttrMap getParsedAttrList(RecordKeeper &Records, ParsedAttrMap *Dupes = nullptr, bool SemaOnly = true) { std::vector Attrs = Records.getAllDerivedDefinitions("Attr"); @@ -4344,7 +4344,7 @@ static void GenerateAppertainsTo(const Record &Attr, raw_ostream &OS) { // written into OS and the checks for merging declaration attributes are // written into MergeOS. static void GenerateMutualExclusionsChecks(const Record &Attr, - const RecordKeeper &Records, + RecordKeeper &Records, raw_ostream &OS, raw_ostream &MergeDeclOS, raw_ostream &MergeStmtOS) { diff --git a/clang/utils/TableGen/ClangSyntaxEmitter.cpp b/clang/utils/TableGen/ClangSyntaxEmitter.cpp index 9720d58731843..2a69e4c353b6b 100644 --- a/clang/utils/TableGen/ClangSyntaxEmitter.cpp +++ b/clang/utils/TableGen/ClangSyntaxEmitter.cpp @@ -41,7 +41,7 @@ using llvm::formatv; // stable and useful way, where abstract Node subclasses correspond to ranges. class Hierarchy { public: - Hierarchy(const llvm::RecordKeeper &Records) { + Hierarchy(llvm::RecordKeeper &Records) { for (llvm::Record *T : Records.getAllDerivedDefinitions("NodeType")) add(T); for (llvm::Record *Derived : Records.getAllDerivedDefinitions("NodeType")) diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h index 1121459be6ce7..ca21c8fc10145 100644 --- a/llvm/include/llvm/TableGen/DirectiveEmitter.h +++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h @@ -15,8 +15,7 @@ namespace llvm { // DirectiveBase.td and provides helper methods for accessing it. 
class DirectiveLanguage { public: - explicit DirectiveLanguage(const llvm::RecordKeeper &Records) - : Records(Records) { + explicit DirectiveLanguage(llvm::RecordKeeper &Records) : Records(Records) { const auto &DirectiveLanguages = getDirectiveLanguages(); Def = DirectiveLanguages[0]; } @@ -71,7 +70,7 @@ class DirectiveLanguage { private: const llvm::Record *Def; - const llvm::RecordKeeper &Records; + llvm::RecordKeeper &Records; std::vector getDirectiveLanguages() const { return Records.getAllDerivedDefinitions("DirectiveLanguage"); diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index ff596df94e4f5..5d36fcf57e23e 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -2057,19 +2057,28 @@ class RecordKeeper { //===--------------------------------------------------------------------===// // High-level helper methods, useful for tablegen backends. + // Non-const methods return std::vector by value or reference. + // Const methods return std::vector by value or + // ArrayRef. + /// Get all the concrete records that inherit from the one specified /// class. The class must be defined. - std::vector getAllDerivedDefinitions(StringRef ClassName) const; + ArrayRef getAllDerivedDefinitions(StringRef ClassName) const; + const std::vector &getAllDerivedDefinitions(StringRef ClassName); /// Get all the concrete records that inherit from all the specified /// classes. The classes must be defined. - std::vector getAllDerivedDefinitions( - ArrayRef ClassNames) const; + std::vector + getAllDerivedDefinitions(ArrayRef ClassNames) const; + std::vector + getAllDerivedDefinitions(ArrayRef ClassNames); /// Get all the concrete records that inherit from specified class, if the /// class is defined. Returns an empty vector if the class is not defined. - std::vector + ArrayRef getAllDerivedDefinitionsIfDefined(StringRef ClassName) const; + const std::vector & + getAllDerivedDefinitionsIfDefined(StringRef ClassName); void dump() const; @@ -2081,9 +2090,24 @@ class RecordKeeper { RecordKeeper &operator=(RecordKeeper &&) = delete; RecordKeeper &operator=(const RecordKeeper &) = delete; + // Helper template functions for backend accessors. + template + const VecTy & + getAllDerivedDefinitionsImpl(StringRef ClassName, + std::map &Cache) const; + + template + VecTy getAllDerivedDefinitionsImpl(ArrayRef ClassNames) const; + + template + const VecTy &getAllDerivedDefinitionsIfDefinedImpl( + StringRef ClassName, std::map &Cache) const; + std::string InputFilename; RecordMap Classes, Defs; - mutable StringMap> ClassRecordsMap; + mutable std::map> + ClassRecordsMapConst; + mutable std::map> ClassRecordsMap; GlobalMap ExtraGlobals; // These members are for the phase timing feature. We need a timer group, diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index cead8f865a607..17afa2f7eb1b9 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -3248,25 +3248,28 @@ void RecordKeeper::stopBackendTimer() { } } -std::vector -RecordKeeper::getAllDerivedDefinitions(StringRef ClassName) const { +template +const VecTy &RecordKeeper::getAllDerivedDefinitionsImpl( + StringRef ClassName, std::map &Cache) const { // We cache the record vectors for single classes. Many backends request // the same vectors multiple times. 
- auto Pair = ClassRecordsMap.try_emplace(ClassName); + auto Pair = Cache.try_emplace(ClassName.str()); if (Pair.second) - Pair.first->second = getAllDerivedDefinitions(ArrayRef(ClassName)); + Pair.first->second = + getAllDerivedDefinitionsImpl(ArrayRef(ClassName)); return Pair.first->second; } -std::vector RecordKeeper::getAllDerivedDefinitions( +template +VecTy RecordKeeper::getAllDerivedDefinitionsImpl( ArrayRef ClassNames) const { - SmallVector ClassRecs; - std::vector Defs; + SmallVector ClassRecs; + VecTy Defs; assert(ClassNames.size() > 0 && "At least one class must be passed."); for (const auto &ClassName : ClassNames) { - Record *Class = getClass(ClassName); + const Record *Class = getClass(ClassName); if (!Class) PrintFatalError("The class '" + ClassName + "' is not defined\n"); ClassRecs.push_back(Class); @@ -3274,20 +3277,54 @@ std::vector RecordKeeper::getAllDerivedDefinitions( for (const auto &OneDef : getDefs()) { if (all_of(ClassRecs, [&OneDef](const Record *Class) { - return OneDef.second->isSubClassOf(Class); - })) + return OneDef.second->isSubClassOf(Class); + })) Defs.push_back(OneDef.second.get()); } - llvm::sort(Defs, LessRecord()); - return Defs; } +template +const VecTy &RecordKeeper::getAllDerivedDefinitionsIfDefinedImpl( + StringRef ClassName, std::map &Cache) const { + return getClass(ClassName) + ? getAllDerivedDefinitionsImpl(ClassName, Cache) + : Cache[""]; +} + +ArrayRef +RecordKeeper::getAllDerivedDefinitions(StringRef ClassName) const { + return getAllDerivedDefinitionsImpl>( + ClassName, ClassRecordsMapConst); +} + +const std::vector & +RecordKeeper::getAllDerivedDefinitions(StringRef ClassName) { + return getAllDerivedDefinitionsImpl>(ClassName, + ClassRecordsMap); +} + +std::vector +RecordKeeper::getAllDerivedDefinitions(ArrayRef ClassNames) const { + return getAllDerivedDefinitionsImpl>(ClassNames); +} + std::vector +RecordKeeper::getAllDerivedDefinitions(ArrayRef ClassNames) { + return getAllDerivedDefinitionsImpl>(ClassNames); +} + +ArrayRef RecordKeeper::getAllDerivedDefinitionsIfDefined(StringRef ClassName) const { - return getClass(ClassName) ? 
getAllDerivedDefinitions(ClassName) - : std::vector(); + return getAllDerivedDefinitionsIfDefinedImpl>( + ClassName, ClassRecordsMapConst); +} + +const std::vector & +RecordKeeper::getAllDerivedDefinitionsIfDefined(StringRef ClassName) { + return getAllDerivedDefinitionsIfDefinedImpl>( + ClassName, ClassRecordsMap); } void RecordKeeper::dumpAllocationStats(raw_ostream &OS) const { diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index 23c64912c780f..05104e938b848 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -43,7 +43,7 @@ CodeGenIntrinsicContext::CodeGenIntrinsicContext(const RecordKeeper &RC) { CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { CodeGenIntrinsicContext Ctx(RC); - std::vector Defs = RC.getAllDerivedDefinitions("Intrinsic"); + ArrayRef Defs = RC.getAllDerivedDefinitions("Intrinsic"); Intrinsics.reserve(Defs.size()); for (const Record *Def : Defs) diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp index 4f57234d6fe27..a4d6d8d21b356 100644 --- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp +++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp @@ -21,7 +21,7 @@ LLVM_DUMP_METHOD void SubtargetFeatureInfo::dump() const { #endif std::vector> -SubtargetFeatureInfo::getAll(const RecordKeeper &Records) { +SubtargetFeatureInfo::getAll(RecordKeeper &Records) { std::vector> SubtargetFeatures; std::vector AllPredicates = Records.getAllDerivedDefinitions("Predicate"); diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h index 2635e4b733e1a..fee2c0263c496 100644 --- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h +++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h @@ -49,7 +49,7 @@ struct SubtargetFeatureInfo { void dump() const; static std::vector> - getAll(const RecordKeeper &Records); + getAll(RecordKeeper &Records); /// Emit the subtarget feature flag definitions. 
/// diff --git a/llvm/utils/TableGen/ExegesisEmitter.cpp b/llvm/utils/TableGen/ExegesisEmitter.cpp index d48c7f3a480f2..0de7cb4233748 100644 --- a/llvm/utils/TableGen/ExegesisEmitter.cpp +++ b/llvm/utils/TableGen/ExegesisEmitter.cpp @@ -59,7 +59,7 @@ class ExegesisEmitter { }; static std::map -collectPfmCounters(const RecordKeeper &Records) { +collectPfmCounters(RecordKeeper &Records) { std::map PfmCounterNameTable; const auto AddPfmCounterName = [&PfmCounterNameTable]( const Record *PfmCounterDef) { diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index a491a049e7c81..2606768c0c582 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -335,7 +335,7 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { private: std::string ClassName; - const RecordKeeper &RK; + RecordKeeper &RK; const CodeGenDAGPatterns CGP; const CodeGenTarget &Target; CodeGenRegBank &CGRegs; diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 66ca38ee5ae2f..7ae61cb7c446b 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -1545,7 +1545,7 @@ void SubtargetEmitter::EmitSchedModel(raw_ostream &OS) { EmitProcessorModels(OS); } -static void emitPredicateProlog(const RecordKeeper &Records, raw_ostream &OS) { +static void emitPredicateProlog(RecordKeeper &Records, raw_ostream &OS) { std::string Buffer; raw_string_ostream Stream(Buffer); diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp index 7ee6fa5c83211..882410bac081b 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -52,7 +52,7 @@ static void PrintEnums(RecordKeeper &Records, raw_ostream &OS) { static void PrintSets(const RecordKeeper &Records, raw_ostream &OS) { SetTheory Sets; Sets.addFieldExpander("Set", "Elements"); - for (Record *Rec : Records.getAllDerivedDefinitions("Set")) { + for (const Record *Rec : Records.getAllDerivedDefinitions("Set")) { OS << Rec->getName() << " = ["; const std::vector *Elts = Sets.expand(Rec); assert(Elts && "Couldn't expand Set instance"); diff --git a/mlir/include/mlir/TableGen/GenInfo.h b/mlir/include/mlir/TableGen/GenInfo.h index ef2e12f07df16..d59d64223827b 100644 --- a/mlir/include/mlir/TableGen/GenInfo.h +++ b/mlir/include/mlir/TableGen/GenInfo.h @@ -21,8 +21,8 @@ class RecordKeeper; namespace mlir { /// Generator function to invoke. -using GenFunction = std::function; +using GenFunction = + std::function; /// Structure to group information about a generator (argument to invoke via /// mlir-tblgen, description, and generator function). @@ -34,7 +34,7 @@ class GenInfo { : arg(arg), description(description), generator(std::move(generator)) {} /// Invokes the generator and returns whether the generator failed. 
- bool invoke(const llvm::RecordKeeper &recordKeeper, raw_ostream &os) const { + bool invoke(llvm::RecordKeeper &recordKeeper, raw_ostream &os) const { assert(generator && "Cannot call generator with null generator"); return generator(recordKeeper, os); } diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index eccd8029d950f..feca04bff643d 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -690,10 +690,10 @@ class DefGenerator { bool emitDefs(StringRef selectedDialect); protected: - DefGenerator(std::vector &&defs, raw_ostream &os, + DefGenerator(const std::vector &defs, raw_ostream &os, StringRef defType, StringRef valueType, bool isAttrGenerator) - : defRecords(std::move(defs)), os(os), defType(defType), - valueType(valueType), isAttrGenerator(isAttrGenerator) { + : defRecords(defs), os(os), defType(defType), valueType(valueType), + isAttrGenerator(isAttrGenerator) { // Sort by occurrence in file. llvm::sort(defRecords, [](llvm::Record *lhs, llvm::Record *rhs) { return lhs->getID() < rhs->getID(); @@ -721,13 +721,13 @@ class DefGenerator { /// A specialized generator for AttrDefs. struct AttrDefGenerator : public DefGenerator { - AttrDefGenerator(const llvm::RecordKeeper &records, raw_ostream &os) + AttrDefGenerator(llvm::RecordKeeper &records, raw_ostream &os) : DefGenerator(records.getAllDerivedDefinitionsIfDefined("AttrDef"), os, "Attr", "Attribute", /*isAttrGenerator=*/true) {} }; /// A specialized generator for TypeDefs. struct TypeDefGenerator : public DefGenerator { - TypeDefGenerator(const llvm::RecordKeeper &records, raw_ostream &os) + TypeDefGenerator(llvm::RecordKeeper &records, raw_ostream &os) : DefGenerator(records.getAllDerivedDefinitionsIfDefined("TypeDef"), os, "Type", "Type", /*isAttrGenerator=*/false) {} }; @@ -1029,7 +1029,7 @@ bool DefGenerator::emitDefs(StringRef selectedDialect) { /// Find all type constraints for which a C++ function should be generated. 
static std::vector -getAllTypeConstraints(const llvm::RecordKeeper &records) { +getAllTypeConstraints(llvm::RecordKeeper &records) { std::vector result; for (llvm::Record *def : records.getAllDerivedDefinitionsIfDefined("TypeConstraint")) { @@ -1046,7 +1046,7 @@ getAllTypeConstraints(const llvm::RecordKeeper &records) { return result; } -static void emitTypeConstraintDecls(const llvm::RecordKeeper &records, +static void emitTypeConstraintDecls(llvm::RecordKeeper &records, raw_ostream &os) { static const char *const typeConstraintDecl = R"( bool {0}(::mlir::Type type); @@ -1056,7 +1056,7 @@ bool {0}(::mlir::Type type); os << strfmt(typeConstraintDecl, *constr.getCppFunctionName()); } -static void emitTypeConstraintDefs(const llvm::RecordKeeper &records, +static void emitTypeConstraintDefs(llvm::RecordKeeper &records, raw_ostream &os) { static const char *const typeConstraintDef = R"( bool {0}(::mlir::Type type) { @@ -1087,13 +1087,13 @@ static llvm::cl::opt static mlir::GenRegistration genAttrDefs("gen-attrdef-defs", "Generate AttrDef definitions", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { AttrDefGenerator generator(records, os); return generator.emitDefs(attrDialect); }); static mlir::GenRegistration genAttrDecls("gen-attrdef-decls", "Generate AttrDef declarations", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { AttrDefGenerator generator(records, os); return generator.emitDecls(attrDialect); }); @@ -1109,13 +1109,13 @@ static llvm::cl::opt static mlir::GenRegistration genTypeDefs("gen-typedef-defs", "Generate TypeDef definitions", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { TypeDefGenerator generator(records, os); return generator.emitDefs(typeDialect); }); static mlir::GenRegistration genTypeDecls("gen-typedef-decls", "Generate TypeDef declarations", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { TypeDefGenerator generator(records, os); return generator.emitDecls(typeDialect); }); @@ -1123,14 +1123,14 @@ static mlir::GenRegistration static mlir::GenRegistration genTypeConstrDefs("gen-type-constraint-defs", "Generate type constraint definitions", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { emitTypeConstraintDefs(records, os); return false; }); static mlir::GenRegistration genTypeConstrDecls("gen-type-constraint-decls", "Generate type constraint declarations", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { emitTypeConstraintDecls(records, os); return false; }); diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index ffa2e17cc8f91..b7f6ca975a9a3 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -149,7 +149,7 @@ static void verifyClause(Record *op, Record *clause) { /// Verify that all properties of `OpenMP_Clause`s of records deriving from /// `OpenMP_Op`s have been inherited by the latter. 
-static bool verifyDecls(const RecordKeeper &recordKeeper, raw_ostream &) { +static bool verifyDecls(RecordKeeper &recordKeeper, raw_ostream &) { for (Record *op : recordKeeper.getAllDerivedDefinitions("OpenMP_Op")) { for (Record *clause : op->getValueAsListOfDefs("clauseList")) verifyClause(op, clause); diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index 71df80cd110f1..066e5b24f5a3c 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -282,7 +282,7 @@ static void emitSourceLink(StringRef inputFilename, raw_ostream &os) { << inputFromMlirInclude << ")\n\n"; } -static void emitOpDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { +static void emitOpDoc(RecordKeeper &recordKeeper, raw_ostream &os) { auto opDefs = getRequestedOpDefinitions(recordKeeper); os << "\n"; @@ -371,8 +371,8 @@ static void emitAttrOrTypeDefDoc(const AttrOrTypeDef &def, raw_ostream &os) { os << "\n"; } -static void emitAttrOrTypeDefDoc(const RecordKeeper &recordKeeper, - raw_ostream &os, StringRef recordTypeName) { +static void emitAttrOrTypeDefDoc(RecordKeeper &recordKeeper, raw_ostream &os, + StringRef recordTypeName) { std::vector defs = recordKeeper.getAllDerivedDefinitions(recordTypeName); @@ -405,7 +405,7 @@ static void emitEnumDoc(const EnumAttr &def, raw_ostream &os) { os << "\n"; } -static void emitEnumDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { +static void emitEnumDoc(RecordKeeper &recordKeeper, raw_ostream &os) { std::vector defs = recordKeeper.getAllDerivedDefinitions("EnumAttr"); @@ -518,7 +518,7 @@ static void emitDialectDoc(const Dialect &dialect, StringRef inputFilename, os); } -static bool emitDialectDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { +static bool emitDialectDoc(RecordKeeper &recordKeeper, raw_ostream &os) { std::vector dialectDefs = recordKeeper.getAllDerivedDefinitionsIfDefined("Dialect"); SmallVector dialects(dialectDefs.begin(), dialectDefs.end()); @@ -617,34 +617,34 @@ static bool emitDialectDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { static mlir::GenRegistration genAttrRegister("gen-attrdef-doc", "Generate dialect attribute documentation", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { emitAttrOrTypeDefDoc(records, os, "AttrDef"); return false; }); static mlir::GenRegistration genOpRegister("gen-op-doc", "Generate dialect documentation", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { emitOpDoc(records, os); return false; }); static mlir::GenRegistration genTypeRegister("gen-typedef-doc", "Generate dialect type documentation", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { emitAttrOrTypeDefDoc(records, os, "TypeDef"); return false; }); static mlir::GenRegistration genEnumRegister("gen-enum-doc", "Generate dialect enum documentation", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { emitEnumDoc(records, os); return false; }); static mlir::GenRegistration genRegister("gen-dialect-doc", "Generate dialect documentation", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { return emitDialectDoc(records, os); }); diff --git a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp index 4b06b92fbc8a8..00f21a1cefbdd 100644 --- a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp +++ 
b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp @@ -62,8 +62,7 @@ static void emitMethodNameAndArgs(const InterfaceMethod &method, /// Get an array of all OpInterface definitions but exclude those subclassing /// "DeclareOpInterfaceMethods". static std::vector -getAllInterfaceDefinitions(const llvm::RecordKeeper &recordKeeper, - StringRef name) { +getAllInterfaceDefinitions(llvm::RecordKeeper &recordKeeper, StringRef name) { std::vector defs = recordKeeper.getAllDerivedDefinitions((name + "Interface").str()); @@ -118,7 +117,7 @@ class InterfaceGenerator { /// A specialized generator for attribute interfaces. struct AttrInterfaceGenerator : public InterfaceGenerator { - AttrInterfaceGenerator(const llvm::RecordKeeper &records, raw_ostream &os) + AttrInterfaceGenerator(llvm::RecordKeeper &records, raw_ostream &os) : InterfaceGenerator(getAllInterfaceDefinitions(records, "Attr"), os) { valueType = "::mlir::Attribute"; interfaceBaseType = "AttributeInterface"; @@ -133,7 +132,7 @@ struct AttrInterfaceGenerator : public InterfaceGenerator { }; /// A specialized generator for operation interfaces. struct OpInterfaceGenerator : public InterfaceGenerator { - OpInterfaceGenerator(const llvm::RecordKeeper &records, raw_ostream &os) + OpInterfaceGenerator(llvm::RecordKeeper &records, raw_ostream &os) : InterfaceGenerator(getAllInterfaceDefinitions(records, "Op"), os) { valueType = "::mlir::Operation *"; interfaceBaseType = "OpInterface"; @@ -149,7 +148,7 @@ struct OpInterfaceGenerator : public InterfaceGenerator { }; /// A specialized generator for type interfaces. struct TypeInterfaceGenerator : public InterfaceGenerator { - TypeInterfaceGenerator(const llvm::RecordKeeper &records, raw_ostream &os) + TypeInterfaceGenerator(llvm::RecordKeeper &records, raw_ostream &os) : InterfaceGenerator(getAllInterfaceDefinitions(records, "Type"), os) { valueType = "::mlir::Type"; interfaceBaseType = "TypeInterface"; @@ -684,15 +683,15 @@ struct InterfaceGenRegistration { genDefDesc(("Generate " + genDesc + " interface definitions").str()), genDocDesc(("Generate " + genDesc + " interface documentation").str()), genDecls(genDeclArg, genDeclDesc, - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { return GeneratorT(records, os).emitInterfaceDecls(); }), genDefs(genDefArg, genDefDesc, - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { return GeneratorT(records, os).emitInterfaceDefs(); }), genDocs(genDocArg, genDocDesc, - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { return GeneratorT(records, os).emitInterfaceDocs(); }) {} diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp index 2c79ba2cd6353..401f02246ed23 100644 --- a/mlir/tools/mlir-tblgen/RewriterGen.cpp +++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp @@ -268,7 +268,7 @@ class PatternEmitter { // inlining them. 
class StaticMatcherHelper { public: - StaticMatcherHelper(raw_ostream &os, const RecordKeeper &recordKeeper, + StaticMatcherHelper(raw_ostream &os, RecordKeeper &recordKeeper, RecordOperatorMap &mapper); // Determine if we should inline the match logic or delegate to a static @@ -1886,7 +1886,7 @@ void PatternEmitter::createAggregateLocalVarsForOpArgs( } StaticMatcherHelper::StaticMatcherHelper(raw_ostream &os, - const RecordKeeper &recordKeeper, + RecordKeeper &recordKeeper, RecordOperatorMap &mapper) : opMap(mapper), staticVerifierEmitter(os, recordKeeper) {} @@ -1951,7 +1951,7 @@ StringRef StaticMatcherHelper::getVerifierName(DagLeaf leaf) { return staticVerifierEmitter.getTypeConstraintFn(leaf.getAsConstraint()); } -static void emitRewriters(const RecordKeeper &recordKeeper, raw_ostream &os) { +static void emitRewriters(RecordKeeper &recordKeeper, raw_ostream &os) { emitSourceFileHeader("Rewriters", os, recordKeeper); const auto &patterns = recordKeeper.getAllDerivedDefinitions("Pattern"); @@ -2001,7 +2001,7 @@ static void emitRewriters(const RecordKeeper &recordKeeper, raw_ostream &os) { static mlir::GenRegistration genRewriters("gen-rewriters", "Generate pattern rewriters", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { emitRewriters(records, os); return false; }); diff --git a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp index a55f3539f31db..0957a5d55db95 100644 --- a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp +++ b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp @@ -146,14 +146,13 @@ static irdl::DialectOp createIRDLDialect(OpBuilder &builder) { } static std::vector -getOpDefinitions(const RecordKeeper &recordKeeper) { +getOpDefinitions(RecordKeeper &recordKeeper) { if (!recordKeeper.getClass("Op")) return {}; return recordKeeper.getAllDerivedDefinitions("Op"); } -static bool emitDialectIRDLDefs(const RecordKeeper &recordKeeper, - raw_ostream &os) { +static bool emitDialectIRDLDefs(RecordKeeper &recordKeeper, raw_ostream &os) { // Initialize. MLIRContext ctx; ctx.getOrLoadDialect(); @@ -185,6 +184,6 @@ static bool emitDialectIRDLDefs(const RecordKeeper &recordKeeper, static mlir::GenRegistration genOpDefs("gen-dialect-irdl-defs", "Generate IRDL dialect definitions", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { return emitDialectIRDLDefs(records, os); }); From 616a8ce6203d8c7569266bfaf163e74df1f440ad Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 5 Sep 2024 18:45:21 -0700 Subject: [PATCH 321/425] [clang-format] Correctly annotate braces in macro definition (#107352) Also add a test case for #107096. Fixes #106418. 
--- clang/lib/Format/UnwrappedLineParser.cpp | 3 ++-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 20 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 246b29d308bfa..1727ed93822b1 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -570,7 +570,8 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) {
              NextTok->isOneOf(Keywords.kw_of, Keywords.kw_in, Keywords.kw_as));
         ProbablyBracedList =
-            ProbablyBracedList || (IsCpp && NextTok->is(tok::l_paren));
+            ProbablyBracedList || (IsCpp && (PrevTok->Tok.isLiteral() ||
+                                             NextTok->is(tok::l_paren)));
         // If there is a comma, semicolon or right paren after the closing
         // brace, we assume this is a braced initializer list.
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index c0436d8a2e180..1bb796fd6f5ee 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3278,6 +3278,26 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   EXPECT_BRACE_KIND(Tokens[10], BK_Block);
   EXPECT_TOKEN(Tokens[11], tok::r_brace, TT_StructRBrace);
   EXPECT_BRACE_KIND(Tokens[11], BK_Block);
+
+  Tokens = annotate("#define MACRO \\\n"
+                    "  struct hash { \\\n"
+                    "    void f() { return; } \\\n"
+                    "  };");
+  ASSERT_EQ(Tokens.size(), 20u) << Tokens;
+  EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_StructLBrace);
+  EXPECT_BRACE_KIND(Tokens[8], BK_Block);
+  EXPECT_TOKEN(Tokens[10], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[11], tok::l_paren, TT_FunctionDeclarationLParen);
+  EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_FunctionLBrace);
+  EXPECT_BRACE_KIND(Tokens[13], BK_Block);
+  EXPECT_BRACE_KIND(Tokens[16], BK_Block);
+  EXPECT_TOKEN(Tokens[17], tok::r_brace, TT_StructRBrace);
+  EXPECT_BRACE_KIND(Tokens[17], BK_Block);
+
+  Tokens = annotate("#define MEMBER(NAME) NAME{\"\"}");
+  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_BRACE_KIND(Tokens[7], BK_BracedInit);
+  EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit);
 }

 TEST_F(TokenAnnotatorTest, UnderstandsElaboratedTypeSpecifier) {

From c02fd17c1e20615c9e6174a3f8ad4ef0ec5ebbec Mon Sep 17 00:00:00 2001
From: donald chen
Date: Fri, 6 Sep 2024 10:02:05 +0800
Subject: [PATCH 322/425] [mlir] [scf] fix crash when conversion from scf to control flow (#107221)

This patch fixes a crash when scf.parallel's region doesn't terminate with a
reduce op. This can happen in dialect conversion.
---
 mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
index 2372ab5b82a77..45f3bcfa393be 100644
--- a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
+++ b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
@@ -482,7 +482,10 @@ LogicalResult ParallelLowering::matchAndRewrite(ParallelOp parallelOp,
                                                PatternRewriter &rewriter) const {
   Location loc = parallelOp.getLoc();
-  auto reductionOp = cast(parallelOp.getBody()->getTerminator());
+  auto reductionOp = dyn_cast(parallelOp.getBody()->getTerminator());
+  if (!reductionOp) {
+    return failure();
+  }
   // For a parallel loop, we essentially need to create an n-dimensional loop
We do this by translating to scf.for ops and have those lowered in From 33ceb2dd7596a05277fd246865862df6b03cf976 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 19:04:30 -0700 Subject: [PATCH 323/425] [clang-tidy] Avoid repeated hash lookups (NFC) (#107490) --- .../cppcoreguidelines/PreferMemberInitializerCheck.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp index 5f046c502eb38..e516b71088425 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp @@ -67,9 +67,7 @@ static bool canAdvanceAssignment(AssignedLevel Level) { static void updateAssignmentLevel( const FieldDecl *Field, const Expr *Init, const CXXConstructorDecl *Ctor, llvm::DenseMap &AssignedFields) { - auto It = AssignedFields.find(Field); - if (It == AssignedFields.end()) - It = AssignedFields.insert({Field, AssignedLevel::None}).first; + auto It = AssignedFields.try_emplace(Field, AssignedLevel::None).first; if (!canAdvanceAssignment(It->second)) // fast path for already decided field. From 144314eaa5ca7f44817cf0ac162dbd17a5d88391 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 19:04:56 -0700 Subject: [PATCH 324/425] [SLPVectorizer] Avoid repeated hash lookups (NFC) (#107491) --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a77d236413a96..c87d1055c8bc5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5854,13 +5854,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } // Build a map between user nodes and their operands order to speedup // search. The graph currently does not provide this dependency directly. - for (EdgeInfo &EI : TE->UserTreeIndices) { - TreeEntry *UserTE = EI.UserTE; - auto It = Users.find(UserTE); - if (It == Users.end()) - It = Users.insert({UserTE, {}}).first; - It->second.emplace_back(EI.EdgeIdx, TE); - } + for (EdgeInfo &EI : TE->UserTreeIndices) + Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE); } // Erase filtered entries. for (TreeEntry *TE : Filtered) From 5acd9d11373ca67f0d4baf17a78ebb56193a7df0 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 5 Sep 2024 19:13:08 -0700 Subject: [PATCH 325/425] [clang][scan] Report module dependencies in topological order (#107474) --- .../Tooling/DependencyScanning/ModuleDepCollector.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 370d834846859..c775adc0ddd73 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -569,12 +569,11 @@ ModuleDepCollectorPP::handleTopLevelModule(const Module *M) { return {}; // If this module has been handled already, just return its ID. 
-  auto ModI = MDC.ModularDeps.insert({M, nullptr});
-  if (!ModI.second)
-    return ModI.first->second->ID;
+  if (auto ModI = MDC.ModularDeps.find(M); ModI != MDC.ModularDeps.end())
+    return ModI->second->ID;
-  ModI.first->second = std::make_unique();
-  ModuleDeps &MD = *ModI.first->second;
+  auto OwnedMD = std::make_unique();
+  ModuleDeps &MD = *OwnedMD;
   MD.ID.ModuleName = M->getFullModuleName();
   MD.IsSystem = M->IsSystem;
@@ -650,6 +649,8 @@ ModuleDepCollectorPP::handleTopLevelModule(const Module *M) {
   MD.BuildInfo = std::move(CI);
+  MDC.ModularDeps.insert({M, std::move(OwnedMD)});
+
   return MD.ID;
 }

From ede40da1f8c1e91601b985cd32ad785aa8806880 Mon Sep 17 00:00:00 2001
From: Longsheng Mou
Date: Fri, 6 Sep 2024 10:45:59 +0800
Subject: [PATCH 326/425] [mlir][tensor] Add check for indices of `tensor.gather` (#106894)

This patch adds a check for the indices of `tensor.gather` and `tensor.scatter`:
the length of gather_dims/scatter_dims should match the size of the last
dimension of the indices. Fixes #94901.
---
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 12 +++-
 mlir/test/Dialect/Tensor/invalid.mlir | 80 ++++++++++++++++++------
 2 files changed, 69 insertions(+), 23 deletions(-)

diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 996de530c255d..5fbb3d55f8faa 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -1288,7 +1288,8 @@ RankedTensorType GatherOp::inferResultType(RankedTensorType sourceType,
 }
 static LogicalResult
-verifyGatherOrScatterDims(Operation *op, ArrayRef dims, int64_t rank,
+verifyGatherOrScatterDims(Operation *op, ArrayRef dims,
+                          ArrayRef indices, int64_t rank,
                           StringRef gatherOrScatter, StringRef sourceOrDest) {
   if (dims.empty())
     return op->emitOpError(gatherOrScatter) << "_dims must be non-empty";
@@ -1297,6 +1298,9 @@ verifyGatherOrScatterDims(Operation *op, ArrayRef dims, int64_t rank,
   if (numGatherDims > rank)
     return op->emitOpError(gatherOrScatter)
            << "_dims overflow " << sourceOrDest << " rank";
+  if (indices.empty() || indices.back() != numGatherDims)
+    return op->emitOpError(gatherOrScatter)
+           << "_dims length must match the size of last dimension of indices";
   for (int64_t val : dims) {
     if (val < 0)
       return op->emitOpError(gatherOrScatter)
@@ -1316,7 +1320,8 @@ verifyGatherOrScatterDims(Operation *op, ArrayRef dims, int64_t rank,
 LogicalResult GatherOp::verify() {
   int64_t sourceRank = getSourceType().getRank();
   ArrayRef gatherDims = getGatherDims();
-  if (failed(verifyGatherOrScatterDims(getOperation(), gatherDims, sourceRank,
+  if (failed(verifyGatherOrScatterDims(getOperation(), gatherDims,
+                                       getIndicesType().getShape(), sourceRank,
                                        "gather", "source")))
     return failure();
@@ -3530,7 +3535,8 @@ void ScatterOp::getAsmResultNames(
 LogicalResult ScatterOp::verify() {
   int64_t destRank = getDestType().getRank();
   ArrayRef scatterDims = getScatterDims();
-  if (failed(verifyGatherOrScatterDims(getOperation(), scatterDims, destRank,
+  if (failed(verifyGatherOrScatterDims(getOperation(), scatterDims,
+                                       getIndicesType().getShape(), destRank,
                                        "scatter", "dest")))
     return failure();
diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir
index d9db32b8801ac..84e6c59e403dd 100644
--- a/mlir/test/Dialect/Tensor/invalid.mlir
+++ b/mlir/test/Dialect/Tensor/invalid.mlir
@@ -455,41 +455,59 @@ func.func @gather_coordinate_rank_overflow(

 // -----

+func.func @gather_coordinate_rank_mismatch0(
+    %source: tensor<4x5x6xf32>, %indices: tensor) {
+  // 
expected-error@+1 {{gather_dims length must match the size of last dimension of indices}} + %out = tensor.gather %source[%indices] gather_dims([0, 1, 2]): + (tensor<4x5x6xf32>, tensor) -> tensor<1x2xf32> +} + +// ----- + +func.func @gather_coordinate_rank_mismatch1( + %source: tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) { + // expected-error@+1 {{gather_dims length must match the size of last dimension of indices}} + %out = tensor.gather %source[%indices] gather_dims([0, 1, 2]): + (tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2xf32> +} + +// ----- + func.func @gather_coordinate_negative( - %source : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) { + %source : tensor<4x5x6xf32>, %indices: tensor<1x2x1xindex>) { // expected-error@+1 {{gather_dims value must be non-negative}} %out = tensor.gather %source[%indices] gather_dims([-1]): - (tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32> + (tensor<4x5x6xf32>, tensor<1x2x1xindex>) -> tensor<1x2x1xf32> return } // ----- func.func @gather_coordinate_overflow( - %source : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) { + %source : tensor<4x5x6xf32>, %indices: tensor<1x2x1xindex>) { // expected-error@+1 {{gather_dims value must be smaller than source rank}} %out = tensor.gather %source[%indices] gather_dims([42]): - (tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32> + (tensor<4x5x6xf32>, tensor<1x2x1xindex>) -> tensor<1x2x1xf32> return } // ----- -func.func @gather_coordinate_overflow( - %source : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) { +func.func @gather_coordinate_increase( + %source : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) { // expected-error@+1 {{gather_dims values must be strictly increasing}} %out = tensor.gather %source[%indices] gather_dims([1, 0]): - (tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32> + (tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2x1x1xf32> return } // ----- func.func @gather_wrong_result_type( - %source : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) { + %source : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) { // expected-error@+1 {{result type mismatch: expected 'tensor<1x2x1x5x1xf32>' or its rank-reduced variant 'tensor<1x2x5xf32>' (got: 'tensor<1x2x1xf32>')}} %out = tensor.gather %source[%indices] gather_dims([0, 2]): - (tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1xf32> + (tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2x1xf32> return } @@ -517,12 +535,34 @@ func.func @scatter_coordinate_rank_overflow( // ----- +func.func @scatter_coordinate_rank_mismatch0( + %source : tensor, + %dest : tensor<4x5x6xf32>, %indices: tensor) { + // expected-error@+1 {{scatter_dims length must match the size of last dimension of indices}} + %out = tensor.scatter %source into %dest[%indices] scatter_dims([0, 1, 2]) unique: + (tensor, tensor<4x5x6xf32>, tensor) -> tensor<1x2xf32> + return +} + +// ----- + +func.func @scatter_coordinate_rank_mismatch1( + %source : tensor, + %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) { + // expected-error@+1 {{scatter_dims length must match the size of last dimension of indices}} + %out = tensor.scatter %source into %dest[%indices] scatter_dims([0, 1, 2]) unique: + (tensor, tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2xf32> + return +} + +// ----- + func.func @scatter_coordinate_negative( %source : tensor, - %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) { + %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x1xindex>) { // expected-error@+1 {{scatter_dims value 
must be non-negative}}
 %out = tensor.scatter %source into %dest[%indices] scatter_dims([-1]) unique:
-    (tensor, tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32>
+    (tensor, tensor<4x5x6xf32>, tensor<1x2x1xindex>) -> tensor<1x2x1xf32>
 return
}

// -----

func.func @scatter_coordinate_overflow(
 %source : tensor,
- %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) {
+ %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x1xindex>) {
 // expected-error@+1 {{scatter_dims value must be smaller than dest rank}}
 %out = tensor.scatter %source into %dest[%indices] scatter_dims([42]) unique:
-    (tensor, tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32>
+    (tensor, tensor<4x5x6xf32>, tensor<1x2x1xindex>) -> tensor<1x2x1xf32>
 return
}

// -----

-func.func @scatter_coordinate_overflow(
+func.func @scatter_coordinate_increase(
 %source : tensor,
- %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) {
+ %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) {
 // expected-error@+1 {{scatter_dims values must be strictly increasing}}
 %out = tensor.scatter %source into %dest[%indices] scatter_dims([1, 0]) unique:
-    (tensor, tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32>
+    (tensor, tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2x1x1xf32>
 return
}

@@ -552,10 +592,10 @@ func.func @scatter_coordinate_overflow(

 func.func @scatter_missing_unique(
 %source : tensor,
- %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) {
+ %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) {
 // expected-error@+1 {{requires 'unique' attribute to be set}}
 %out = tensor.scatter %source into %dest[%indices] scatter_dims([0, 2]):
-    (tensor, tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1xf32>
+    (tensor, tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2x1xf32>
 return
}

@@ -563,10 +603,10 @@ func.func @scatter_missing_unique(

 func.func @scatter_wrong_result_type(
 %source : tensor,
- %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) {
+ %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) {
 // expected-error@+1 {{source type mismatch: expected 'tensor<1x2x1x5x1xf32>' or its rank-reduced variant 'tensor<1x2x5xf32>' (got: 'tensor')}}
 %out = tensor.scatter %source into %dest[%indices] scatter_dims([0, 2]) unique:
-    (tensor, tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1xf32>
+    (tensor, tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2x1xf32>
 return
}

From 8e35c86977ce5529a9387657321ac9fefcdae5b5 Mon Sep 17 00:00:00 2001
From: Helena Kotas
Date: Thu, 5 Sep 2024 21:50:00 -0700
Subject: [PATCH 327/425] [HLSL] Apply resource attributes to the resource type rather than the handle member (#107160)

Converts existing resource attributes `[[hlsl::resource_class(..)]]` and
`[[is_rov]]` from declaration attributes to type attributes.

During type attribute processing all HLSL resource type attributes are
validated and collected by `SemaHLSL` (`SemaHLSL::handleResourceTypeAttr`). At
the end of the declaration they are combined into a single
`HLSLAttributedResourceType` instance (`SemaHLSL::ProcessResourceTypeAttributes`)
that wraps the original type and stores all of the necessary information about
the resource. `SemaHLSL` will also need to short-term-store the `TypeLoc`
information for the newly created type that will be grabbed by
`TypeSpecLocFiller` soon after it is created.
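A rough before/after sketch of the change (illustrative only; the struct and
field names are made up, and the updated ParserHLSL/AST tests in this patch are
the authoritative reference for the exact spelling):

  // Before: hlsl::resource_class was a declaration attribute on the handle field.
  struct MyBuffer {
    [[hlsl::resource_class(UAV)]] int handle;
  };

  // After: it is a type attribute; together with [[hlsl::is_rov]] it is folded
  // into a single HLSLAttributedResourceType that wraps the field's type.
  struct MyBuffer {
    int [[hlsl::resource_class(UAV)]] handle;
  };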
Updates all places that expected resource attributes on declarations like resource binding diagnostic, builtin types in HLSLExternalSemaSource, or codegen. Also includes implementation of `TreeTransform::TransformHLSLAttributedResourceType` that enables the use of attributed resource types inside templates. Fixes #104861 Part 2/2 --- clang/include/clang/AST/TypeLoc.h | 8 + clang/include/clang/Basic/Attr.td | 6 +- .../clang/Basic/DiagnosticSemaKinds.td | 1 + clang/include/clang/Sema/SemaHLSL.h | 20 +- clang/lib/AST/TypePrinter.cpp | 13 +- clang/lib/CodeGen/CGHLSLRuntime.cpp | 9 +- clang/lib/Sema/HLSLExternalSemaSource.cpp | 23 ++- clang/lib/Sema/SemaDeclAttr.cpp | 6 - clang/lib/Sema/SemaHLSL.cpp | 177 ++++++++++++------ clang/lib/Sema/TreeTransform.h | 22 ++- clang/test/AST/HLSL/RWBuffer-AST.hlsl | 10 +- ...a-attribute-supported-attributes-list.test | 2 - clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl | 17 +- .../ParserHLSL/hlsl_is_rov_attr_error.hlsl | 21 ++- .../ParserHLSL/hlsl_resource_class_attr.hlsl | 48 ++--- .../hlsl_resource_class_attr_error.hlsl | 27 ++- .../hlsl_resource_handle_attrs.hlsl | 17 +- .../SemaHLSL/resource_binding_attr_error.hlsl | 2 +- .../resource_binding_attr_error_resource.hlsl | 10 +- .../resource_binding_attr_error_udt.hlsl | 10 +- 20 files changed, 281 insertions(+), 168 deletions(-) diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index 5db39eb3aefa7..03fbdcf60140d 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -951,12 +951,20 @@ class HLSLAttributedResourceTypeLoc HLSLAttributedResourceLocInfo> { public: TypeLoc getWrappedLoc() const { return getInnerTypeLoc(); } + + TypeLoc getContainedLoc() const { + return TypeLoc(getTypePtr()->getContainedType(), getNonLocalData()); + } + void setSourceRange(const SourceRange &R) { getLocalData()->Range = R; } SourceRange getLocalSourceRange() const { return getLocalData()->Range; } void initializeLocal(ASTContext &Context, SourceLocation loc) { setSourceRange(SourceRange()); } QualType getInnerType() const { return getTypePtr()->getWrappedType(); } + unsigned getLocalDataSize() const { + return sizeof(HLSLAttributedResourceLocInfo); + } }; struct ObjCObjectTypeLocInfo { diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 8d2a362abc3c3..0c98f8e25a6fb 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4643,16 +4643,14 @@ def HLSLResource : InheritableAttr { let Documentation = [InternalOnly]; } -def HLSLROV : InheritableAttr { +def HLSLROV : TypeAttr { let Spellings = [CXX11<"hlsl", "is_rov">]; - let Subjects = SubjectList<[Struct]>; let LangOpts = [HLSL]; let Documentation = [InternalOnly]; } -def HLSLResourceClass : InheritableAttr { +def HLSLResourceClass : TypeAttr { let Spellings = [CXX11<"hlsl", "resource_class">]; - let Subjects = SubjectList<[Field]>; let LangOpts = [HLSL]; let Args = [ EnumArgument<"ResourceClass", "llvm::hlsl::ResourceClass", diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 083684670a980..58819a64813fc 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12368,6 +12368,7 @@ def err_hlsl_packoffset_cross_reg_boundary : Error<"packoffset cannot cross regi def err_hlsl_packoffset_alignment_mismatch : Error<"packoffset at 'y' not match alignment %0 required by %1">; def err_hlsl_pointers_unsupported : Error< 
"%select{pointers|references}0 are unsupported in HLSL">; +def err_hlsl_missing_resource_class : Error<"HLSL resource needs to have [[hlsl::resource_class()]] attribute">; def err_hlsl_operator_unsupported : Error< "the '%select{&|*|->}0' operator is unsupported in HLSL">; diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 285e4e5f3c765..64b39ca7712ee 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -15,8 +15,10 @@ #include "clang/AST/ASTFwd.h" #include "clang/AST/Attr.h" +#include "clang/AST/Type.h" #include "clang/Basic/SourceLocation.h" #include "clang/Sema/SemaBase.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/TargetParser/Triple.h" #include @@ -26,6 +28,12 @@ class IdentifierInfo; class ParsedAttr; class Scope; +// FIXME: This can be hidden (as static function in SemaHLSL.cpp) once we no +// longer need to create builtin buffer types in HLSLExternalSemaSource. +bool CreateHLSLAttributedResourceType(Sema &S, QualType Wrapped, + ArrayRef AttrList, + QualType &ResType); + class SemaHLSL : public SemaBase { public: SemaHLSL(Sema &S); @@ -59,8 +67,6 @@ class SemaHLSL : public SemaBase { void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL); void handlePackOffsetAttr(Decl *D, const ParsedAttr &AL); void handleShaderAttr(Decl *D, const ParsedAttr &AL); - void handleROVAttr(Decl *D, const ParsedAttr &AL); - void handleResourceClassAttr(Decl *D, const ParsedAttr &AL); void handleResourceBindingAttr(Decl *D, const ParsedAttr &AL); void handleParamModifierAttr(Decl *D, const ParsedAttr &AL); bool handleResourceTypeAttr(const ParsedAttr &AL); @@ -78,6 +84,16 @@ class SemaHLSL : public SemaBase { ExprResult ActOnOutParamExpr(ParmVarDecl *Param, Expr *Arg); QualType getInoutParameterType(QualType Ty); + +private: + // HLSL resource type attributes need to be processed all at once. + // This is a list to collect them. + llvm::SmallVector HLSLResourcesTypeAttrs; + + /// SourceLocation corresponding to HLSLAttributedResourceTypeLocs that we + /// have not yet populated. 
+ llvm::DenseMap + LocsForHLSLAttributedResources; }; } // namespace clang diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index b1d9516c96eb7..3bef9370e621c 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -1942,6 +1942,10 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::BTFTypeTag: llvm_unreachable("BTFTypeTag attribute handled separately"); + case attr::HLSLResourceClass: + case attr::HLSLROV: + llvm_unreachable("HLSL resource type attributes handled separately"); + case attr::OpenCLPrivateAddressSpace: case attr::OpenCLGlobalAddressSpace: case attr::OpenCLGlobalDeviceAddressSpace: @@ -2062,7 +2066,11 @@ void TypePrinter::printBTFTagAttributedAfter(const BTFTagAttributedType *T, void TypePrinter::printHLSLAttributedResourceBefore( const HLSLAttributedResourceType *T, raw_ostream &OS) { printBefore(T->getWrappedType(), OS); +} +void TypePrinter::printHLSLAttributedResourceAfter( + const HLSLAttributedResourceType *T, raw_ostream &OS) { + printAfter(T->getWrappedType(), OS); const HLSLAttributedResourceType::Attributes &Attrs = T->getAttrs(); OS << " [[hlsl::resource_class(" << HLSLResourceClassAttr::ConvertResourceClassToStr(Attrs.ResourceClass) @@ -2071,11 +2079,6 @@ void TypePrinter::printHLSLAttributedResourceBefore( OS << " [[hlsl::is_rov()]]"; } -void TypePrinter::printHLSLAttributedResourceAfter( - const HLSLAttributedResourceType *T, raw_ostream &OS) { - printAfter(T->getWrappedType(), OS); -} - void TypePrinter::printObjCInterfaceBefore(const ObjCInterfaceType *T, raw_ostream &OS) { OS << T->getDecl()->getName(); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 4bd7b6ba58de0..b6e6555e63fca 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -295,13 +295,14 @@ void CGHLSLRuntime::annotateHLSLResource(const VarDecl *D, GlobalVariable *GV) { // inside the record decl for (auto *FD : RD->fields()) { const auto *HLSLResAttr = FD->getAttr(); - const auto *HLSLResClassAttr = FD->getAttr(); - if (!HLSLResAttr || !HLSLResClassAttr) + const HLSLAttributedResourceType *AttrResType = + dyn_cast(FD->getType().getTypePtr()); + if (!HLSLResAttr || !AttrResType) continue; - llvm::hlsl::ResourceClass RC = HLSLResClassAttr->getResourceClass(); + llvm::hlsl::ResourceClass RC = AttrResType->getAttrs().ResourceClass; + bool IsROV = AttrResType->getAttrs().IsROV; llvm::hlsl::ResourceKind RK = HLSLResAttr->getResourceKind(); - bool IsROV = FD->hasAttr(); llvm::hlsl::ElementType ET = calculateElementType(CGM.getContext(), Ty); BufferResBinding Binding(D->getAttr()); diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 9aacbe4ad9548..071e64fe56d48 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -13,10 +13,13 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Attr.h" #include "clang/AST/DeclCXX.h" +#include "clang/AST/Type.h" #include "clang/Basic/AttrKinds.h" #include "clang/Basic/HLSLRuntime.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/Sema.h" +#include "clang/Sema/SemaHLSL.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Frontend/HLSL/HLSLResource.h" #include @@ -107,7 +110,7 @@ struct BuiltinTypeDeclBuilder { } BuiltinTypeDeclBuilder & - addHandleMember(ResourceClass RC, ResourceKind RK, bool IsROV, + addHandleMember(Sema &S, ResourceClass RC, ResourceKind RK, bool IsROV, AccessSpecifier Access = 
AccessSpecifier::AS_private) { if (Record->isCompleteDefinition()) return *this; @@ -118,16 +121,16 @@ struct BuiltinTypeDeclBuilder { Ty = Record->getASTContext().getPointerType( QualType(TTD->getTypeForDecl(), 0)); } - // add handle member - Attr *ResourceClassAttr = - HLSLResourceClassAttr::CreateImplicit(Record->getASTContext(), RC); + + // add handle member with resource type attributes + QualType AttributedResTy = QualType(); + SmallVector Attrs = { + HLSLResourceClassAttr::CreateImplicit(Record->getASTContext(), RC), + IsROV ? HLSLROVAttr::CreateImplicit(Record->getASTContext()) : nullptr}; Attr *ResourceAttr = HLSLResourceAttr::CreateImplicit(Record->getASTContext(), RK); - Attr *ROVAttr = - IsROV ? HLSLROVAttr::CreateImplicit(Record->getASTContext()) : nullptr; - addMemberVariable("h", Ty, {ResourceClassAttr, ResourceAttr, ROVAttr}, - Access); - + if (CreateHLSLAttributedResourceType(S, Ty, Attrs, AttributedResTy)) + addMemberVariable("h", AttributedResTy, {ResourceAttr}, Access); return *this; } @@ -494,7 +497,7 @@ static BuiltinTypeDeclBuilder setupBufferType(CXXRecordDecl *Decl, Sema &S, ResourceClass RC, ResourceKind RK, bool IsROV) { return BuiltinTypeDeclBuilder(Decl) - .addHandleMember(RC, RK, IsROV) + .addHandleMember(S, RC, RK, IsROV) .addDefaultHandleConstructor(S, RC); } diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 33547c2e6e145..d068cb6a78f26 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6907,12 +6907,6 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_HLSLResourceBinding: S.HLSL().handleResourceBindingAttr(D, AL); break; - case ParsedAttr::AT_HLSLROV: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_HLSLResourceClass: - S.HLSL().handleResourceClassAttr(D, AL); - break; case ParsedAttr::AT_HLSLParamModifier: S.HLSL().handleParamModifierAttr(D, AL); break; diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index d5ccd3815eb66..3b40769939f12 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "clang/Sema/SemaHLSL.h" +#include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" #include "clang/AST/DeclBase.h" #include "clang/AST/DeclCXX.h" @@ -22,7 +23,9 @@ #include "clang/Sema/Initialization.h" #include "clang/Sema/ParsedAttr.h" #include "clang/Sema/Sema.h" +#include "clang/Sema/Template.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" @@ -30,6 +33,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include +#include using namespace clang; @@ -559,46 +563,125 @@ void SemaHLSL::handleShaderAttr(Decl *D, const ParsedAttr &AL) { D->addAttr(NewAttr); } -void SemaHLSL::handleResourceClassAttr(Decl *D, const ParsedAttr &AL) { - if (!AL.isArgIdent(0)) { - Diag(AL.getLoc(), diag::err_attribute_argument_type) - << AL << AANT_ArgumentIdentifier; - return; - } +bool clang::CreateHLSLAttributedResourceType(Sema &S, QualType Wrapped, + ArrayRef AttrList, + QualType &ResType) { + assert(AttrList.size() && "expected list of resource attributes"); - IdentifierLoc *Loc = AL.getArgAsIdent(0); - StringRef Identifier = Loc->Ident->getName(); - SourceLocation ArgLoc = Loc->Loc; + QualType Contained = QualType(); + 
HLSLAttributedResourceType::Attributes ResAttrs = {}; - // Validate. - llvm::dxil::ResourceClass RC; - if (!HLSLResourceClassAttr::ConvertStrToResourceClass(Identifier, RC)) { - Diag(ArgLoc, diag::warn_attribute_type_not_supported) - << "ResourceClass" << Identifier; - return; + bool HasResourceClass = false; + for (const Attr *A : AttrList) { + if (!A) + continue; + switch (A->getKind()) { + case attr::HLSLResourceClass: { + llvm::dxil::ResourceClass RC = + cast(A)->getResourceClass(); + if (HasResourceClass) { + S.Diag(A->getLocation(), ResAttrs.ResourceClass == RC + ? diag::warn_duplicate_attribute_exact + : diag::warn_duplicate_attribute) + << A; + return false; + } + ResAttrs.ResourceClass = RC; + HasResourceClass = true; + break; + } + case attr::HLSLROV: + ResAttrs.IsROV = true; + break; + default: + llvm_unreachable("unhandled resource attribute type"); + } } - D->addAttr(HLSLResourceClassAttr::Create(getASTContext(), RC, ArgLoc)); + if (!HasResourceClass) { + S.Diag(AttrList.back()->getRange().getEnd(), + diag::err_hlsl_missing_resource_class); + return false; + } + + ResType = S.getASTContext().getHLSLAttributedResourceType(Wrapped, Contained, + ResAttrs); + return true; } -// Validates HLSL resource type attribute and adds it to the list to be -// processed into a single HLSLAttributedResourceType later on. -// Returns false if the attribute is invalid. +// Validates and creates an HLSL attribute that is applied as type attribute on +// HLSL resource. The attributes are collected in HLSLResourcesTypeAttrs and at +// the end of the declaration they are applied to the declaration type by +// wrapping it in HLSLAttributedResourceType. bool SemaHLSL::handleResourceTypeAttr(const ParsedAttr &AL) { - // FIXME: placeholder - not yet implemented + Attr *A = nullptr; + + // validate number of arguments + if (!AL.checkExactlyNumArgs(SemaRef, AL.getMinArgs())) + return false; + + switch (AL.getKind()) { + case ParsedAttr::AT_HLSLResourceClass: { + if (!AL.isArgIdent(0)) { + Diag(AL.getLoc(), diag::err_attribute_argument_type) + << AL << AANT_ArgumentIdentifier; + return false; + } + + IdentifierLoc *Loc = AL.getArgAsIdent(0); + StringRef Identifier = Loc->Ident->getName(); + SourceLocation ArgLoc = Loc->Loc; + + // Validate resource class value + llvm::dxil::ResourceClass RC; + if (!HLSLResourceClassAttr::ConvertStrToResourceClass(Identifier, RC)) { + Diag(ArgLoc, diag::warn_attribute_type_not_supported) + << "ResourceClass" << Identifier; + return false; + } + A = HLSLResourceClassAttr::Create(getASTContext(), RC, AL.getLoc()); + break; + } + case ParsedAttr::AT_HLSLROV: + A = HLSLROVAttr::Create(getASTContext(), AL.getLoc()); + break; + default: + llvm_unreachable("unhandled HLSL attribute"); + } + + HLSLResourcesTypeAttrs.emplace_back(A); return true; } -// Combines all resource type attributes and create HLSLAttributedResourceType. +// Combines all resource type attributes and creates HLSLAttributedResourceType. QualType SemaHLSL::ProcessResourceTypeAttributes(QualType CurrentType) { - // FIXME: placeholder - not yet implemented - return CurrentType; + if (!HLSLResourcesTypeAttrs.size()) + return CurrentType; + + QualType QT = CurrentType; + if (CreateHLSLAttributedResourceType(SemaRef, CurrentType, + HLSLResourcesTypeAttrs, QT)) { + const HLSLAttributedResourceType *RT = + dyn_cast(QT.getTypePtr()); + // Use the location of the first attribute as the location of the aggregated + // type. The attributes are stored in HLSLResourceTypeAttrs in the same + // order as they are parsed. 
+ SourceLocation Loc = HLSLResourcesTypeAttrs[0]->getLoc(); + LocsForHLSLAttributedResources.insert(std::pair(RT, Loc)); + } + HLSLResourcesTypeAttrs.clear(); + return QT; } // Returns source location for the HLSLAttributedResourceType SourceLocation SemaHLSL::TakeLocForHLSLAttribute(const HLSLAttributedResourceType *RT) { - // FIXME: placeholder - not yet implemented + auto I = LocsForHLSLAttributedResources.find(RT); + if (I != LocsForHLSLAttributedResources.end()) { + SourceLocation Loc = I->second; + LocsForHLSLAttributedResources.erase(I); + return Loc; + } return SourceLocation(); } @@ -656,33 +739,19 @@ static void updateResourceClassFlagsFromDeclResourceClass( } } -template -static const T *getSpecifiedHLSLAttrFromRecordDecl(RecordDecl *TheRecordDecl) { - if (!TheRecordDecl) - return nullptr; - - if (TheRecordDecl->hasAttr()) - return TheRecordDecl->getAttr(); - for (auto *FD : TheRecordDecl->fields()) { - const T *Attr = FD->getAttr(); - if (Attr) - return Attr; +const HLSLAttributedResourceType * +findAttributedResourceTypeOnField(VarDecl *VD) { + assert(VD != nullptr && "expected VarDecl"); + if (RecordDecl *RD = getRecordDeclFromVarDecl(VD)) { + for (auto *FD : RD->fields()) { + if (const HLSLAttributedResourceType *AttrResType = + dyn_cast(FD->getType().getTypePtr())) + return AttrResType; + } } return nullptr; } -template -static const T *getSpecifiedHLSLAttrFromVarDecl(VarDecl *VD) { - RecordDecl *TheRecordDecl = nullptr; - if (VD) { - TheRecordDecl = getRecordDeclFromVarDecl(VD); - if (!TheRecordDecl) - return nullptr; - } - - return getSpecifiedHLSLAttrFromRecordDecl(TheRecordDecl); -} - static void updateResourceClassFlagsFromRecordType(RegisterBindingFlags &Flags, const RecordType *RT) { llvm::SmallVector TypesToScan; @@ -702,10 +771,11 @@ static void updateResourceClassFlagsFromRecordType(RegisterBindingFlags &Flags, const RecordDecl *RD = RT->getDecl(); for (FieldDecl *FD : RD->fields()) { - if (HLSLResourceClassAttr *RCAttr = - FD->getAttr()) { + const Type *FieldTy = FD->getType().getTypePtr(); + if (const HLSLAttributedResourceType *AttrResType = + dyn_cast(FieldTy)) { updateResourceClassFlagsFromDeclResourceClass( - Flags, RCAttr->getResourceClass()); + Flags, AttrResType->getAttrs().ResourceClass); continue; } TypesToScan.emplace_back(FD->getType().getTypePtr()); @@ -732,11 +802,10 @@ static RegisterBindingFlags HLSLFillRegisterBindingFlags(Sema &S, } // Samplers, UAVs, and SRVs are VarDecl types else if (VarDecl *TheVarDecl = dyn_cast(TheDecl)) { - const HLSLResourceClassAttr *resClassAttr = - getSpecifiedHLSLAttrFromVarDecl(TheVarDecl); - if (resClassAttr) { + if (const HLSLAttributedResourceType *AttrResType = + findAttributedResourceTypeOnField(TheVarDecl)) { Flags.Resource = true; - Flags.ResourceClass = resClassAttr->getResourceClass(); + Flags.ResourceClass = AttrResType->getAttrs().ResourceClass; } else { const clang::Type *TheBaseType = TheVarDecl->getType().getTypePtr(); while (TheBaseType->isArrayType()) diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 27eac401c28f5..0daf620b4123e 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7462,8 +7462,26 @@ QualType TreeTransform::TransformBTFTagAttributedType( template QualType TreeTransform::TransformHLSLAttributedResourceType( TypeLocBuilder &TLB, HLSLAttributedResourceTypeLoc TL) { - llvm_unreachable( - "Unexpected TreeTransform for HLSLAttributedResourceTypeLoc"); + + const HLSLAttributedResourceType *oldType = TL.getTypePtr(); + + 
QualType WrappedTy = getDerived().TransformType(TLB, TL.getWrappedLoc()); + if (WrappedTy.isNull()) + return QualType(); + + QualType ContainedTy = QualType(); + if (!oldType->getContainedType().isNull()) + ContainedTy = getDerived().TransformType(TLB, TL.getContainedLoc()); + + QualType Result = TL.getType(); + if (getDerived().AlwaysRebuild() || WrappedTy != oldType->getWrappedType() || + ContainedTy != oldType->getContainedType()) { + Result = SemaRef.Context.getHLSLAttributedResourceType( + WrappedTy, ContainedTy, oldType->getAttrs()); + } + + TLB.push(Result); + return Result; } template diff --git a/clang/test/AST/HLSL/RWBuffer-AST.hlsl b/clang/test/AST/HLSL/RWBuffer-AST.hlsl index 1f6ef60e121ea..0e7803ce50a89 100644 --- a/clang/test/AST/HLSL/RWBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/RWBuffer-AST.hlsl @@ -30,8 +30,7 @@ RWBuffer Buffer; // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class RWBuffer definition // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h 'element_type *' -// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <> Implicit UAV +// CHECK-NEXT: implicit h 'element_type * {{\[\[}}hlsl::resource_class(UAV)]]':'element_type *' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' @@ -39,7 +38,7 @@ RWBuffer Buffer; // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * {{\[\[}}hlsl::resource_class(UAV)]]':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const RWBuffer' lvalue implicit this // CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline @@ -49,7 +48,7 @@ RWBuffer Buffer; // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * {{\[\[}}hlsl::resource_class(UAV)]]':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'RWBuffer' lvalue implicit this // CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline @@ -59,6 +58,5 @@ RWBuffer Buffer; // CHECK: TemplateArgument type 'float' // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit referenced h 'float *' -// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <> Implicit UAV +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit referenced h 'float * {{\[\[}}hlsl::resource_class(UAV)]]':'float *' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test 
b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index 5ebbd29b316bf..eca8633114902 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -82,8 +82,6 @@ // CHECK-NEXT: FunctionReturnThunks (SubjectMatchRule_function) // CHECK-NEXT: GNUInline (SubjectMatchRule_function) // CHECK-NEXT: HIPManaged (SubjectMatchRule_variable) -// CHECK-NEXT: HLSLROV (SubjectMatchRule_record_not_is_union) -// CHECK-NEXT: HLSLResourceClass (SubjectMatchRule_field) // CHECK-NEXT: Hot (SubjectMatchRule_function) // CHECK-NEXT: HybridPatchable (SubjectMatchRule_function) // CHECK-NEXT: IBAction (SubjectMatchRule_objc_method_is_instance) diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl index 29850828ad3bc..24c85c6ccf7d7 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl @@ -1,9 +1,16 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: -HLSLROVAttr 0x{{[0-9a-f]+}} -struct [[hlsl::is_rov]] Eg1 { - int i; +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +struct MyBuffer { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h; }; -Eg1 e1; +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t {{\[\[}}hlsl::resource_class(SRV)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +__hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res; + +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:14:6 f 'void () +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t {{\[\[}}hlsl::resource_class(Sampler)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +void f() { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r; +} diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl index a21fed22220b6..68b2d9ecb190a 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl @@ -1,15 +1,16 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s -verify +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify -// expected-error@+1{{'is_rov' attribute takes no arguments}} -struct [[hlsl::is_rov(3)]] Eg1 { - int i; -}; +// expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}} +[[hlsl::is_rov()]] __hlsl_resource_t res0; -Eg1 e1; +// expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}} +__hlsl_resource_t [[hlsl::is_rov()]] res1; +// expected-error@+1{{'is_rov' attribute takes no arguments}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; + // expected-error@+1{{use of undeclared identifier 'gibberish'}} -struct [[hlsl::is_rov(gibberish)]] Eg2 { - int i; -}; +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3; -Eg2 e2; +// duplicate attribute with the same meaning - no error +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov()]] [[hlsl::is_rov()]] res4; diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl index 4b002e2d89009..f11a64d33839b 100644 --- 
a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl @@ -1,32 +1,32 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} SRV -struct Eg1 { - [[hlsl::resource_class(SRV)]] int i; +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]]':'__hlsl_resource_t' +struct MyBuffer { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; }; -Eg1 e1; - -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:13:8 referenced struct Eg2 definition -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} UAV -struct Eg2 { - [[hlsl::resource_class(UAV)]] int i; -}; -Eg2 e2; +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:49 res '__hlsl_resource_t {{\[\[}}hlsl::resource_class(SRV)]]':'__hlsl_resource_t' +__hlsl_resource_t [[hlsl::resource_class(SRV)]] res; -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:20:8 referenced struct Eg3 definition -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} CBuffer -struct Eg3 { - [[hlsl::resource_class(CBuffer)]] int i; -}; -Eg3 e3; +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:14:6 f 'void () +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:55 r '__hlsl_resource_t {{\[\[}}hlsl::resource_class(Sampler)]]':'__hlsl_resource_t' +void f() { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] r; +} -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:27:8 referenced struct Eg4 definition -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} Sampler -struct Eg4 { - [[hlsl::resource_class(Sampler)]] int i; +// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:23:29 MyBuffer2 +// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:19 typename depth 0 index 0 T +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:23:29 struct MyBuffer2 definition +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]]':'__hlsl_resource_t' +template struct MyBuffer2 { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; }; -Eg4 e4; -RWBuffer In : register(u1); +// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} line:23:29 struct MyBuffer2 definition implicit_instantiation +// CHECK: TemplateArgument type 'float' +// CHECK: BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]]':'__hlsl_resource_t' +MyBuffer2 myBuffer2; diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl index 76bed2f060783..01ff1c007e2b5 100644 --- a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl @@ -1,22 +1,19 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s -verify +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify -struct Eg1 { -// expected-error@+1{{'resource_class' attribute takes one argument}} - [[hlsl::resource_class()]] int i; -}; +// expected-error@+1{{'resource_class' attribute cannot be applied to a declaration}} +[[hlsl::resource_class(UAV)]] __hlsl_resource_t e0; -Eg1 e1; +// expected-error@+1{{'resource_class' attribute takes one argument}} +__hlsl_resource_t [[hlsl::resource_class()]] e1; -struct Eg2 { // 
expected-warning@+1{{ResourceClass attribute argument not supported: gibberish}} - [[hlsl::resource_class(gibberish)]] int i; -}; +__hlsl_resource_t [[hlsl::resource_class(gibberish)]] e2; -Eg2 e2; +// expected-warning@+1{{attribute 'resource_class' is already applied with different arguments}} +__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(UAV)]] e3; -// expected-warning@+1{{'resource_class' attribute only applies to non-static data members}} -struct [[hlsl::resource_class(SRV)]] Eg3 { - int i; -}; +// expected-warning@+1{{attribute 'resource_class' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(SRV)]] e4; -Eg3 e3; +// expected-error@+1{{'resource_class' attribute takes one argument}} +__hlsl_resource_t [[hlsl::resource_class(SRV, "aa")]] e5; diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index 320d1160e761d..6324a11fc8a2d 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -1,15 +1,16 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <> class RWBuffer definition implicit_instantiation -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'float *' -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit UAV +// CHECK: -TemplateArgument type 'float' +// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'float * {{\[\[}}hlsl::resource_class(UAV)]]':'float *' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer RWBuffer Buffer1; -// CHECK: -ClassTemplateDecl 0x{{[0-9a-f]+}} <> implicit RasterizerOrderedBuffer -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} <> implicit class RasterizerOrderedBuffer definition -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h 'element_type *' -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit UAV +// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <> class RasterizerOrderedBuffer definition implicit_instantiation +// CHECK: -TemplateArgument type 'vector' +// CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector' 4 +// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'vector {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov()]]':'vector' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer -// CHECK: -HLSLROVAttr 0x{{[0-9a-f]+}} <> Implicit -RasterizerOrderedBuffer > BufferArray3[4] : register(u4, space1); +RasterizerOrderedBuffer > BufferArray3[4]; diff --git a/clang/test/SemaHLSL/resource_binding_attr_error.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error.hlsl index 6a0b5956545dd..cb728dca838c3 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error.hlsl @@ -2,7 +2,7 @@ template struct MyTemplatedSRV { - [[hlsl::resource_class(SRV)]] T x; + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; }; // valid, The register keyword in this statement isn't binding a resource, rather it is diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl index c40d1d7f60b34..4b6af47c0ab72 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl +++ 
b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl @@ -6,23 +6,23 @@ template struct MyTemplatedSRV { - [[hlsl::resource_class(SRV)]] T x; + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; }; struct MySRV { - [[hlsl::resource_class(SRV)]] int x; + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; }; struct MySampler { - [[hlsl::resource_class(Sampler)]] int x; + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x; }; struct MyUAV { - [[hlsl::resource_class(UAV)]] int x; + __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; }; struct MyCBuffer { - [[hlsl::resource_class(CBuffer)]] int x; + __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x; }; diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl index edb3f30739cdf..ea2d576e4cca5 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl @@ -2,23 +2,23 @@ template struct MyTemplatedUAV { - [[hlsl::resource_class(UAV)]] T x; + __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; }; struct MySRV { - [[hlsl::resource_class(SRV)]] int x; + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; }; struct MySampler { - [[hlsl::resource_class(Sampler)]] int x; + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x; }; struct MyUAV { - [[hlsl::resource_class(UAV)]] int x; + __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; }; struct MyCBuffer { - [[hlsl::resource_class(CBuffer)]] int x; + __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x; }; // Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0 From 77f1b481b884621c12cde5f2ce6f080f11dbbbcc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 6 Sep 2024 09:15:18 +0400 Subject: [PATCH 328/425] DAG: Lower single infinity is.fpclass tests to fcmp (#100380) InstCombine also should have taken care of this, but this should be helpful when the fcmp based lowering strategy tries to combine multiple tests. --- llvm/lib/CodeGen/CodeGenCommonISel.cpp | 2 + .../CodeGen/SelectionDAG/TargetLowering.cpp | 16 ++++ llvm/test/CodeGen/X86/is_fpclass.ll | 92 ++++++++----------- 3 files changed, 54 insertions(+), 56 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenCommonISel.cpp b/llvm/lib/CodeGen/CodeGenCommonISel.cpp index d985751e2be0b..4cd2f6ae2fdb1 100644 --- a/llvm/lib/CodeGen/CodeGenCommonISel.cpp +++ b/llvm/lib/CodeGen/CodeGenCommonISel.cpp @@ -202,6 +202,8 @@ FPClassTest llvm::invertFPClassTestIfSimpler(FPClassTest Test, bool UseFCmp) { case fcSubnormal | fcZero | fcNan: return InvertedTest; case fcInf | fcNan: + case fcPosInf | fcNan: + case fcNegInf | fcNan: // If we're trying to use fcmp, we can take advantage of the nan check // behavior of the compare (but this is more instructions in the integer // expansion). diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index c3affabb19d37..2b41b8a9a810e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8754,6 +8754,22 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, IsOrderedInf ? OrderedCmpOpcode : UnorderedCmpOpcode); } + if ((OrderedFPTestMask == fcPosInf || OrderedFPTestMask == fcNegInf) && + isCondCodeLegalOrCustom(IsOrdered ? 
OrderedCmpOpcode + : UnorderedCmpOpcode, + OperandVT.getSimpleVT())) { + // isposinf(x) --> x == inf + // isneginf(x) --> x == -inf + // isposinf(x) || nan --> x u== inf + // isneginf(x) || nan --> x u== -inf + + SDValue Inf = DAG.getConstantFP( + APFloat::getInf(Semantics, OrderedFPTestMask == fcNegInf), DL, + OperandVT); + return DAG.getSetCC(DL, ResultVT, Op, Inf, + IsOrdered ? OrderedCmpOpcode : UnorderedCmpOpcode); + } + if (OrderedFPTestMask == (fcSubnormal | fcZero) && !IsOrdered) { // TODO: Could handle ordered case, but it produces worse code for // x86. Maybe handle ordered if fabs is free? diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll index cc4d4c4543a51..97136dafa6c2c 100644 --- a/llvm/test/CodeGen/X86/is_fpclass.ll +++ b/llvm/test/CodeGen/X86/is_fpclass.ll @@ -2116,24 +2116,19 @@ entry: define i1 @is_plus_inf_or_nan_f(float %x) { ; X86-LABEL: is_plus_inf_or_nan_f: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-NEXT: sete %cl -; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X86-NEXT: setge %al -; X86-NEXT: orb %cl, %al +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NEXT: fucompp +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: is_plus_inf_or_nan_f: ; X64: # %bb.0: -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-NEXT: sete %cl -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X64-NEXT: setge %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: sete %al ; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 515) ; 0x200|0x3 = "+inf|nan" ret i1 %class @@ -2142,24 +2137,19 @@ define i1 @is_plus_inf_or_nan_f(float %x) { define i1 @is_minus_inf_or_nan_f(float %x) { ; X86-LABEL: is_minus_inf_or_nan_f: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; X86-NEXT: sete %cl -; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X86-NEXT: setge %al -; X86-NEXT: orb %cl, %al +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NEXT: fucompp +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: is_minus_inf_or_nan_f: ; X64: # %bb.0: -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; X64-NEXT: sete %cl -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X64-NEXT: setge %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: sete %al ; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 7) ; "-inf|nan" ret i1 %class @@ -2168,24 +2158,19 @@ define i1 @is_minus_inf_or_nan_f(float %x) { define i1 @not_is_plus_inf_or_nan_f(float %x) { ; X86-LABEL: not_is_plus_inf_or_nan_f: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; X86-NEXT: sete %cl -; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; 
X86-NEXT:    setl %al
-; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: not_is_plus_inf_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; X64-NEXT:    sete %cl
-; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT:    setl %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
  %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 508) ; ~(0x200|0x3) = "~(+inf|nan)"
  ret i1 %class
@@ -2194,24 +2179,19 @@ define i1 @not_is_plus_inf_or_nan_f(float %x) {
 define i1 @not_is_minus_inf_or_nan_f(float %x) {
 ; X86-LABEL: not_is_minus_inf_or_nan_f:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-NEXT:    sete %cl
-; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-NEXT:    setl %al
-; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: not_is_minus_inf_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT:    sete %cl
-; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT:    setl %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
  %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1016) ; "~(-inf|nan)"
  ret i1 %class

From 093b8bfe6b64c916647ae64af8066df22bb6ea65 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 5 Sep 2024 22:29:23 -0700
Subject: [PATCH 329/425] [RISCV] Separate the calling convention handlers
 into their own file. NFC (#107484)

These are used by both SelectionDAG and GlobalISel and are separate from
RISCVTargetLowering. Having a separate file is how other targets are
structured, though other targets generate most of their calling convention
code through tablegen.

I moved the `CC_RISCV` functions from the `llvm::RISCV` namespace to `llvm::`.
That's what the tablegen code on other targets does and the functions already
have RISCV in their name. `RISCVCCAssignFn` is moved from
`RISCVTargetLowering` to the `llvm` namespace.
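For existing callers this is mostly a rename plus a new header. A
representative call site, sketched from the GlobalISel call-lowering change
below (surrounding code omitted):

  #include "RISCVCallingConv.h"

  // Was: RISCV::CC_RISCV / RISCV::CC_RISCV_FastCC, taking a
  // RISCVTargetLowering::RISCVCCAssignFn.
  RISCVOutgoingValueAssigner Assigner(
      CC == CallingConv::Fast ? CC_RISCV_FastCC : CC_RISCV,
      /*IsRet=*/true);

`RISCVCCAssignFn` itself is now declared in the new RISCVCallingConv.h header
in the `llvm` namespace rather than inside `RISCVTargetLowering`.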
--- llvm/lib/Target/RISCV/CMakeLists.txt | 1 + .../Target/RISCV/GISel/RISCVCallLowering.cpp | 29 +- llvm/lib/Target/RISCV/RISCVCallingConv.cpp | 684 +++++++++++++++++ llvm/lib/Target/RISCV/RISCVCallingConv.h | 51 ++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 691 +----------------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 31 +- 6 files changed, 762 insertions(+), 725 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVCallingConv.cpp create mode 100644 llvm/lib/Target/RISCV/RISCVCallingConv.h diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index cbb4c2cedfb97..aef0e4fbbf584 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -29,6 +29,7 @@ add_public_tablegen_target(RISCVCommonTableGen) add_llvm_target(RISCVCodeGen RISCVAsmPrinter.cpp + RISCVCallingConv.cpp RISCVCodeGenPrepare.cpp RISCVDeadRegisterDefinitions.cpp RISCVMakeCompressible.cpp diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 31a9df53a2aa1..14832204058f8 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "RISCVCallLowering.h" +#include "RISCVCallingConv.h" #include "RISCVISelLowering.h" #include "RISCVMachineFunctionInfo.h" #include "RISCVSubtarget.h" @@ -30,14 +31,13 @@ struct RISCVOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { // The function used internally to assign args - we ignore the AssignFn stored // by OutgoingValueAssigner since RISC-V implements its CC using a custom // function with a different signature. - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn; + RISCVCCAssignFn *RISCVAssignFn; // Whether this is assigning args for a return. bool IsRet; public: - RISCVOutgoingValueAssigner( - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) + RISCVOutgoingValueAssigner(RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) : CallLowering::OutgoingValueAssigner(nullptr), RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet) {} @@ -182,14 +182,13 @@ struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { // The function used internally to assign args - we ignore the AssignFn stored // by IncomingValueAssigner since RISC-V implements its CC using a custom // function with a different signature. - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn; + RISCVCCAssignFn *RISCVAssignFn; // Whether this is assigning args from a return. bool IsRet; public: - RISCVIncomingValueAssigner( - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) + RISCVIncomingValueAssigner(RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) : CallLowering::IncomingValueAssigner(nullptr), RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet) {} @@ -425,7 +424,7 @@ bool RISCVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, splitToValueTypes(OrigRetInfo, SplitRetInfos, DL, CC); RISCVOutgoingValueAssigner Assigner( - CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, + CC == CallingConv::Fast ? 
CC_RISCV_FastCC : CC_RISCV, /*IsRet=*/true); RISCVOutgoingValueHandler Handler(MIRBuilder, MF.getRegInfo(), Ret); if (!determineAndHandleAssignments(Handler, Assigner, SplitRetInfos, @@ -461,9 +460,9 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF, for (unsigned I = 0, E = Outs.size(); I < E; ++I) { MVT VT = MVT::getVT(Outs[I].Ty); - if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, I, VT, VT, CCValAssign::Full, - Outs[I].Flags[0], CCInfo, /*IsFixed=*/true, - /*isRet=*/true, nullptr, TLI)) + if (CC_RISCV(MF.getDataLayout(), ABI, I, VT, VT, CCValAssign::Full, + Outs[I].Flags[0], CCInfo, /*IsFixed=*/true, + /*isRet=*/true, nullptr, TLI)) return false; } return true; @@ -576,9 +575,9 @@ bool RISCVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, ++Index; } - RISCVIncomingValueAssigner Assigner( - CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/false); + RISCVIncomingValueAssigner Assigner(CC == CallingConv::Fast ? CC_RISCV_FastCC + : CC_RISCV, + /*IsRet=*/false); RISCVFormalArgHandler Handler(MIRBuilder, MF.getRegInfo()); SmallVector ArgLocs; @@ -639,7 +638,7 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, Call.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); RISCVOutgoingValueAssigner ArgAssigner( - CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, + CC == CallingConv::Fast ? CC_RISCV_FastCC : CC_RISCV, /*IsRet=*/false); RISCVOutgoingValueHandler ArgHandler(MIRBuilder, MF.getRegInfo(), Call); if (!determineAndHandleAssignments(ArgHandler, ArgAssigner, SplitArgInfos, @@ -667,7 +666,7 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, splitToValueTypes(Info.OrigRet, SplitRetInfos, DL, CC); RISCVIncomingValueAssigner RetAssigner( - CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, + CC == CallingConv::Fast ? CC_RISCV_FastCC : CC_RISCV, /*IsRet=*/true); RISCVCallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Call); if (!determineAndHandleAssignments(RetHandler, RetAssigner, SplitRetInfos, diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp new file mode 100644 index 0000000000000..bf6ae2d1c2910 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -0,0 +1,684 @@ +//===-- RISCVCallingConv.cpp - RISC-V Custom CC Routines ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the RISC-V Calling Convention. +// +//===----------------------------------------------------------------------===// + +#include "RISCVCallingConv.h" +#include "RISCVSubtarget.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCRegister.h" + +using namespace llvm; + +// Calling Convention Implementation. +// The expectations for frontend ABI lowering vary from target to target. +// Ideally, an LLVM frontend would be able to avoid worrying about many ABI +// details, but this is a longer term goal. For now, we simply try to keep the +// role of the frontend as simple and well-defined as possible. The rules can +// be summarised as: +// * Never split up large scalar arguments. We handle them here. 
+// * If a hardfloat calling convention is being used, and the struct may be +// passed in a pair of registers (fp+fp, int+fp), and both registers are +// available, then pass as two separate arguments. If either the GPRs or FPRs +// are exhausted, then pass according to the rule below. +// * If a struct could never be passed in registers or directly in a stack +// slot (as it is larger than 2*XLEN and the floating point rules don't +// apply), then pass it using a pointer with the byval attribute. +// * If a struct is less than 2*XLEN, then coerce to either a two-element +// word-sized array or a 2*XLEN scalar (depending on alignment). +// * The frontend can determine whether a struct is returned by reference or +// not based on its size and fields. If it will be returned by reference, the +// frontend must modify the prototype so a pointer with the sret annotation is +// passed as the first argument. This is not necessary for large scalar +// returns. +// * Struct return values and varargs should be coerced to structs containing +// register-size fields in the same situations they would be for fixed +// arguments. + +static const MCPhysReg ArgFPR16s[] = {RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, + RISCV::F13_H, RISCV::F14_H, RISCV::F15_H, + RISCV::F16_H, RISCV::F17_H}; +static const MCPhysReg ArgFPR32s[] = {RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, + RISCV::F13_F, RISCV::F14_F, RISCV::F15_F, + RISCV::F16_F, RISCV::F17_F}; +static const MCPhysReg ArgFPR64s[] = {RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, + RISCV::F13_D, RISCV::F14_D, RISCV::F15_D, + RISCV::F16_D, RISCV::F17_D}; +// This is an interim calling convention and it may be changed in the future. +static const MCPhysReg ArgVRs[] = { + RISCV::V8, RISCV::V9, RISCV::V10, RISCV::V11, RISCV::V12, RISCV::V13, + RISCV::V14, RISCV::V15, RISCV::V16, RISCV::V17, RISCV::V18, RISCV::V19, + RISCV::V20, RISCV::V21, RISCV::V22, RISCV::V23}; +static const MCPhysReg ArgVRM2s[] = {RISCV::V8M2, RISCV::V10M2, RISCV::V12M2, + RISCV::V14M2, RISCV::V16M2, RISCV::V18M2, + RISCV::V20M2, RISCV::V22M2}; +static const MCPhysReg ArgVRM4s[] = {RISCV::V8M4, RISCV::V12M4, RISCV::V16M4, + RISCV::V20M4}; +static const MCPhysReg ArgVRM8s[] = {RISCV::V8M8, RISCV::V16M8}; +static const MCPhysReg ArgVRN2M1s[] = { + RISCV::V8_V9, RISCV::V9_V10, RISCV::V10_V11, RISCV::V11_V12, + RISCV::V12_V13, RISCV::V13_V14, RISCV::V14_V15, RISCV::V15_V16, + RISCV::V16_V17, RISCV::V17_V18, RISCV::V18_V19, RISCV::V19_V20, + RISCV::V20_V21, RISCV::V21_V22, RISCV::V22_V23}; +static const MCPhysReg ArgVRN3M1s[] = { + RISCV::V8_V9_V10, RISCV::V9_V10_V11, RISCV::V10_V11_V12, + RISCV::V11_V12_V13, RISCV::V12_V13_V14, RISCV::V13_V14_V15, + RISCV::V14_V15_V16, RISCV::V15_V16_V17, RISCV::V16_V17_V18, + RISCV::V17_V18_V19, RISCV::V18_V19_V20, RISCV::V19_V20_V21, + RISCV::V20_V21_V22, RISCV::V21_V22_V23}; +static const MCPhysReg ArgVRN4M1s[] = { + RISCV::V8_V9_V10_V11, RISCV::V9_V10_V11_V12, RISCV::V10_V11_V12_V13, + RISCV::V11_V12_V13_V14, RISCV::V12_V13_V14_V15, RISCV::V13_V14_V15_V16, + RISCV::V14_V15_V16_V17, RISCV::V15_V16_V17_V18, RISCV::V16_V17_V18_V19, + RISCV::V17_V18_V19_V20, RISCV::V18_V19_V20_V21, RISCV::V19_V20_V21_V22, + RISCV::V20_V21_V22_V23}; +static const MCPhysReg ArgVRN5M1s[] = { + RISCV::V8_V9_V10_V11_V12, RISCV::V9_V10_V11_V12_V13, + RISCV::V10_V11_V12_V13_V14, RISCV::V11_V12_V13_V14_V15, + RISCV::V12_V13_V14_V15_V16, RISCV::V13_V14_V15_V16_V17, + RISCV::V14_V15_V16_V17_V18, RISCV::V15_V16_V17_V18_V19, + RISCV::V16_V17_V18_V19_V20, RISCV::V17_V18_V19_V20_V21, + 
RISCV::V18_V19_V20_V21_V22, RISCV::V19_V20_V21_V22_V23}; +static const MCPhysReg ArgVRN6M1s[] = { + RISCV::V8_V9_V10_V11_V12_V13, RISCV::V9_V10_V11_V12_V13_V14, + RISCV::V10_V11_V12_V13_V14_V15, RISCV::V11_V12_V13_V14_V15_V16, + RISCV::V12_V13_V14_V15_V16_V17, RISCV::V13_V14_V15_V16_V17_V18, + RISCV::V14_V15_V16_V17_V18_V19, RISCV::V15_V16_V17_V18_V19_V20, + RISCV::V16_V17_V18_V19_V20_V21, RISCV::V17_V18_V19_V20_V21_V22, + RISCV::V18_V19_V20_V21_V22_V23}; +static const MCPhysReg ArgVRN7M1s[] = { + RISCV::V8_V9_V10_V11_V12_V13_V14, RISCV::V9_V10_V11_V12_V13_V14_V15, + RISCV::V10_V11_V12_V13_V14_V15_V16, RISCV::V11_V12_V13_V14_V15_V16_V17, + RISCV::V12_V13_V14_V15_V16_V17_V18, RISCV::V13_V14_V15_V16_V17_V18_V19, + RISCV::V14_V15_V16_V17_V18_V19_V20, RISCV::V15_V16_V17_V18_V19_V20_V21, + RISCV::V16_V17_V18_V19_V20_V21_V22, RISCV::V17_V18_V19_V20_V21_V22_V23}; +static const MCPhysReg ArgVRN8M1s[] = {RISCV::V8_V9_V10_V11_V12_V13_V14_V15, + RISCV::V9_V10_V11_V12_V13_V14_V15_V16, + RISCV::V10_V11_V12_V13_V14_V15_V16_V17, + RISCV::V11_V12_V13_V14_V15_V16_V17_V18, + RISCV::V12_V13_V14_V15_V16_V17_V18_V19, + RISCV::V13_V14_V15_V16_V17_V18_V19_V20, + RISCV::V14_V15_V16_V17_V18_V19_V20_V21, + RISCV::V15_V16_V17_V18_V19_V20_V21_V22, + RISCV::V16_V17_V18_V19_V20_V21_V22_V23}; +static const MCPhysReg ArgVRN2M2s[] = {RISCV::V8M2_V10M2, RISCV::V10M2_V12M2, + RISCV::V12M2_V14M2, RISCV::V14M2_V16M2, + RISCV::V16M2_V18M2, RISCV::V18M2_V20M2, + RISCV::V20M2_V22M2}; +static const MCPhysReg ArgVRN3M2s[] = { + RISCV::V8M2_V10M2_V12M2, RISCV::V10M2_V12M2_V14M2, + RISCV::V12M2_V14M2_V16M2, RISCV::V14M2_V16M2_V18M2, + RISCV::V16M2_V18M2_V20M2, RISCV::V18M2_V20M2_V22M2}; +static const MCPhysReg ArgVRN4M2s[] = { + RISCV::V8M2_V10M2_V12M2_V14M2, RISCV::V10M2_V12M2_V14M2_V16M2, + RISCV::V12M2_V14M2_V16M2_V18M2, RISCV::V14M2_V16M2_V18M2_V20M2, + RISCV::V16M2_V18M2_V20M2_V22M2}; +static const MCPhysReg ArgVRN2M4s[] = {RISCV::V8M4_V12M4, RISCV::V12M4_V16M4, + RISCV::V16M4_V20M4}; + +ArrayRef RISCV::getArgGPRs(const RISCVABI::ABI ABI) { + // The GPRs used for passing arguments in the ILP32* and LP64* ABIs, except + // the ILP32E ABI. + static const MCPhysReg ArgIGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, + RISCV::X13, RISCV::X14, RISCV::X15, + RISCV::X16, RISCV::X17}; + // The GPRs used for passing arguments in the ILP32E/ILP64E ABI. + static const MCPhysReg ArgEGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, + RISCV::X13, RISCV::X14, RISCV::X15}; + + if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) + return ArrayRef(ArgEGPRs); + + return ArrayRef(ArgIGPRs); +} + +static ArrayRef getFastCCArgGPRs(const RISCVABI::ABI ABI) { + // The GPRs used for passing arguments in the FastCC, X5 and X6 might be used + // for save-restore libcall, so we don't use them. + // Don't use X7 for fastcc, since Zicfilp uses X7 as the label register. + static const MCPhysReg FastCCIGPRs[] = { + RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, + RISCV::X16, RISCV::X17, RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31}; + + // The GPRs used for passing arguments in the FastCC when using ILP32E/ILP64E. + static const MCPhysReg FastCCEGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, + RISCV::X13, RISCV::X14, RISCV::X15}; + + if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) + return ArrayRef(FastCCEGPRs); + + return ArrayRef(FastCCIGPRs); +} + +// Pass a 2*XLEN argument that has been split into two XLEN values through +// registers or the stack as necessary. 
+static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
+                                ISD::ArgFlagsTy ArgFlags1, unsigned ValNo2,
+                                MVT ValVT2, MVT LocVT2,
+                                ISD::ArgFlagsTy ArgFlags2, bool EABI) {
+  unsigned XLenInBytes = XLen / 8;
+  const RISCVSubtarget &STI =
+      State.getMachineFunction().getSubtarget<RISCVSubtarget>();
+  ArrayRef<MCPhysReg> ArgGPRs = RISCV::getArgGPRs(STI.getTargetABI());
+
+  if (MCRegister Reg = State.AllocateReg(ArgGPRs)) {
+    // At least one half can be passed via register.
+    State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg,
+                                     VA1.getLocVT(), CCValAssign::Full));
+  } else {
+    // Both halves must be passed on the stack, with proper alignment.
+    // TODO: To be compatible with GCC's behaviors, we force them to have 4-byte
+    // alignment. This behavior may be changed when RV32E/ILP32E is ratified.
+    Align StackAlign(XLenInBytes);
+    if (!EABI || XLen != 32)
+      StackAlign = std::max(StackAlign, ArgFlags1.getNonZeroOrigAlign());
+    State.addLoc(
+        CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(),
+                            State.AllocateStack(XLenInBytes, StackAlign),
+                            VA1.getLocVT(), CCValAssign::Full));
+    State.addLoc(CCValAssign::getMem(
+        ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)),
+        LocVT2, CCValAssign::Full));
+    return false;
+  }
+
+  if (MCRegister Reg = State.AllocateReg(ArgGPRs)) {
+    // The second half can also be passed via register.
+    State.addLoc(
+        CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full));
+  } else {
+    // The second half is passed via the stack, without additional alignment.
+    State.addLoc(CCValAssign::getMem(
+        ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)),
+        LocVT2, CCValAssign::Full));
+  }
+
+  return false;
+}
+
+static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State,
+                                 const RISCVTargetLowering &TLI) {
+  const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT);
+  if (RC == &RISCV::VRRegClass) {
+    // Assign the first mask argument to V0.
+    // This is an interim calling convention and it may be changed in the
+    // future.
+    if (ValVT.getVectorElementType() == MVT::i1)
+      if (MCRegister Reg = State.AllocateReg(RISCV::V0))
+        return Reg;
+    return State.AllocateReg(ArgVRs);
+  }
+  if (RC == &RISCV::VRM2RegClass)
+    return State.AllocateReg(ArgVRM2s);
+  if (RC == &RISCV::VRM4RegClass)
+    return State.AllocateReg(ArgVRM4s);
+  if (RC == &RISCV::VRM8RegClass)
+    return State.AllocateReg(ArgVRM8s);
+  if (RC == &RISCV::VRN2M1RegClass)
+    return State.AllocateReg(ArgVRN2M1s);
+  if (RC == &RISCV::VRN3M1RegClass)
+    return State.AllocateReg(ArgVRN3M1s);
+  if (RC == &RISCV::VRN4M1RegClass)
+    return State.AllocateReg(ArgVRN4M1s);
+  if (RC == &RISCV::VRN5M1RegClass)
+    return State.AllocateReg(ArgVRN5M1s);
+  if (RC == &RISCV::VRN6M1RegClass)
+    return State.AllocateReg(ArgVRN6M1s);
+  if (RC == &RISCV::VRN7M1RegClass)
+    return State.AllocateReg(ArgVRN7M1s);
+  if (RC == &RISCV::VRN8M1RegClass)
+    return State.AllocateReg(ArgVRN8M1s);
+  if (RC == &RISCV::VRN2M2RegClass)
+    return State.AllocateReg(ArgVRN2M2s);
+  if (RC == &RISCV::VRN3M2RegClass)
+    return State.AllocateReg(ArgVRN3M2s);
+  if (RC == &RISCV::VRN4M2RegClass)
+    return State.AllocateReg(ArgVRN4M2s);
+  if (RC == &RISCV::VRN2M4RegClass)
+    return State.AllocateReg(ArgVRN2M4s);
+  llvm_unreachable("Unhandled register class for ValueType");
+}
+
+// Implements the RISC-V calling convention. Returns true upon failure.
+bool llvm::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, + MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, + bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI) { + unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); + assert(XLen == 32 || XLen == 64); + MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; + + // Static chain parameter must not be passed in normal argument registers, + // so we assign t2 for it as done in GCC's __builtin_call_with_static_chain + if (ArgFlags.isNest()) { + if (MCRegister Reg = State.AllocateReg(RISCV::X7)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + // Any return value split in to more than two values can't be returned + // directly. Vectors are returned via the available vector registers. + if (!LocVT.isVector() && IsRet && ValNo > 1) + return true; + + // UseGPRForF16_F32 if targeting one of the soft-float ABIs, if passing a + // variadic argument, or if no F16/F32 argument registers are available. + bool UseGPRForF16_F32 = true; + // UseGPRForF64 if targeting soft-float ABIs or an FLEN=32 ABI, if passing a + // variadic argument, or if no F64 argument registers are available. + bool UseGPRForF64 = true; + + switch (ABI) { + default: + llvm_unreachable("Unexpected ABI"); + case RISCVABI::ABI_ILP32: + case RISCVABI::ABI_ILP32E: + case RISCVABI::ABI_LP64: + case RISCVABI::ABI_LP64E: + break; + case RISCVABI::ABI_ILP32F: + case RISCVABI::ABI_LP64F: + UseGPRForF16_F32 = !IsFixed; + break; + case RISCVABI::ABI_ILP32D: + case RISCVABI::ABI_LP64D: + UseGPRForF16_F32 = !IsFixed; + UseGPRForF64 = !IsFixed; + break; + } + + // FPR16, FPR32, and FPR64 alias each other. + if (State.getFirstUnallocated(ArgFPR32s) == std::size(ArgFPR32s)) { + UseGPRForF16_F32 = true; + UseGPRForF64 = true; + } + + // From this point on, rely on UseGPRForF16_F32, UseGPRForF64 and + // similar local variables rather than directly checking against the target + // ABI. + + ArrayRef ArgGPRs = RISCV::getArgGPRs(ABI); + + if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::bf16 || + (ValVT == MVT::f32 && XLen == 64))) { + MCRegister Reg = State.AllocateReg(ArgGPRs); + if (Reg) { + LocVT = XLenVT; + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (UseGPRForF16_F32 && + (ValVT == MVT::f16 || ValVT == MVT::bf16 || ValVT == MVT::f32)) { + LocVT = XLenVT; + LocInfo = CCValAssign::BCvt; + } else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) { + LocVT = MVT::i64; + LocInfo = CCValAssign::BCvt; + } + + // If this is a variadic argument, the RISC-V calling convention requires + // that it is assigned an 'even' or 'aligned' register if it has 8-byte + // alignment (RV32) or 16-byte alignment (RV64). An aligned register should + // be used regardless of whether the original argument was split during + // legalisation or not. The argument will not be passed by registers if the + // original type is larger than 2*XLEN, so the register alignment rule does + // not apply. + // TODO: To be compatible with GCC's behaviors, we don't align registers + // currently if we are using ILP32E calling convention. This behavior may be + // changed when RV32E/ILP32E is ratified. 
+ unsigned TwoXLenInBytes = (2 * XLen) / 8; + if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes && + DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes && + ABI != RISCVABI::ABI_ILP32E) { + unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); + // Skip 'odd' register if necessary. + if (RegIdx != std::size(ArgGPRs) && RegIdx % 2 == 1) + State.AllocateReg(ArgGPRs); + } + + SmallVectorImpl &PendingLocs = State.getPendingLocs(); + SmallVectorImpl &PendingArgFlags = + State.getPendingArgFlags(); + + assert(PendingLocs.size() == PendingArgFlags.size() && + "PendingLocs and PendingArgFlags out of sync"); + + // Handle passing f64 on RV32D with a soft float ABI or when floating point + // registers are exhausted. + if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) { + assert(PendingLocs.empty() && "Can't lower f64 if it is split"); + // Depending on available argument GPRS, f64 may be passed in a pair of + // GPRs, split between a GPR and the stack, or passed completely on the + // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these + // cases. + MCRegister Reg = State.AllocateReg(ArgGPRs); + if (!Reg) { + unsigned StackOffset = State.AllocateStack(8, Align(8)); + State.addLoc( + CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + return false; + } + LocVT = MVT::i32; + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + MCRegister HiReg = State.AllocateReg(ArgGPRs); + if (HiReg) { + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo)); + } else { + unsigned StackOffset = State.AllocateStack(4, Align(4)); + State.addLoc( + CCValAssign::getCustomMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + } + return false; + } + + // Fixed-length vectors are located in the corresponding scalable-vector + // container types. + if (ValVT.isFixedLengthVector()) + LocVT = TLI.getContainerForFixedLengthVector(LocVT); + + // Split arguments might be passed indirectly, so keep track of the pending + // values. Split vectors are passed via a mix of registers and indirectly, so + // treat them as we would any other argument. + if (ValVT.isScalarInteger() && (ArgFlags.isSplit() || !PendingLocs.empty())) { + LocVT = XLenVT; + LocInfo = CCValAssign::Indirect; + PendingLocs.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + PendingArgFlags.push_back(ArgFlags); + if (!ArgFlags.isSplitEnd()) { + return false; + } + } + + // If the split argument only had two elements, it should be passed directly + // in registers or on the stack. + if (ValVT.isScalarInteger() && ArgFlags.isSplitEnd() && + PendingLocs.size() <= 2) { + assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()"); + // Apply the normal calling convention rules to the first half of the + // split argument. + CCValAssign VA = PendingLocs[0]; + ISD::ArgFlagsTy AF = PendingArgFlags[0]; + PendingLocs.clear(); + PendingArgFlags.clear(); + return CC_RISCVAssign2XLen( + XLen, State, VA, AF, ValNo, ValVT, LocVT, ArgFlags, + ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E); + } + + // Allocate to a register if possible, or else a stack slot. 
+ MCRegister Reg; + unsigned StoreSizeBytes = XLen / 8; + Align StackAlign = Align(XLen / 8); + + if ((ValVT == MVT::f16 || ValVT == MVT::bf16) && !UseGPRForF16_F32) + Reg = State.AllocateReg(ArgFPR16s); + else if (ValVT == MVT::f32 && !UseGPRForF16_F32) + Reg = State.AllocateReg(ArgFPR32s); + else if (ValVT == MVT::f64 && !UseGPRForF64) + Reg = State.AllocateReg(ArgFPR64s); + else if (ValVT.isVector() || ValVT.isRISCVVectorTuple()) { + Reg = allocateRVVReg(ValVT, ValNo, State, TLI); + if (!Reg) { + // For return values, the vector must be passed fully via registers or + // via the stack. + // FIXME: The proposed vector ABI only mandates v8-v15 for return values, + // but we're using all of them. + if (IsRet) + return true; + // Try using a GPR to pass the address + if ((Reg = State.AllocateReg(ArgGPRs))) { + LocVT = XLenVT; + LocInfo = CCValAssign::Indirect; + } else if (ValVT.isScalableVector()) { + LocVT = XLenVT; + LocInfo = CCValAssign::Indirect; + } else { + // Pass fixed-length vectors on the stack. + LocVT = ValVT; + StoreSizeBytes = ValVT.getStoreSize(); + // Align vectors to their element sizes, being careful for vXi1 + // vectors. + StackAlign = MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne(); + } + } + } else { + Reg = State.AllocateReg(ArgGPRs); + } + + unsigned StackOffset = + Reg ? 0 : State.AllocateStack(StoreSizeBytes, StackAlign); + + // If we reach this point and PendingLocs is non-empty, we must be at the + // end of a split argument that must be passed indirectly. + if (!PendingLocs.empty()) { + assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()"); + assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()"); + + for (auto &It : PendingLocs) { + if (Reg) + It.convertToReg(Reg); + else + It.convertToMem(StackOffset); + State.addLoc(It); + } + PendingLocs.clear(); + PendingArgFlags.clear(); + return false; + } + + assert((!UseGPRForF16_F32 || !UseGPRForF64 || LocVT == XLenVT || + (TLI.getSubtarget().hasVInstructions() && + (ValVT.isVector() || ValVT.isRISCVVectorTuple()))) && + "Expected an XLenVT or vector types at this stage"); + + if (Reg) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + + // When a scalar floating-point value is passed on the stack, no + // bit-conversion is needed. + if (ValVT.isFloatingPoint() && LocInfo != CCValAssign::Indirect) { + assert(!ValVT.isVector()); + LocVT = ValVT; + LocInfo = CCValAssign::Full; + } + State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + return false; +} + +// FastCC has less than 1% performance improvement for some particular +// benchmark. But theoretically, it may have benefit for some cases. 
+bool llvm::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, + unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State, + bool IsFixed, bool IsRet, Type *OrigTy, + const RISCVTargetLowering &TLI) { + if (LocVT == MVT::i32 || LocVT == MVT::i64) { + if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + const RISCVSubtarget &Subtarget = TLI.getSubtarget(); + + if (LocVT == MVT::f16 && Subtarget.hasStdExtZfhmin()) { + static const MCPhysReg FPR16List[] = { + RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, RISCV::F13_H, RISCV::F14_H, + RISCV::F15_H, RISCV::F16_H, RISCV::F17_H, RISCV::F0_H, RISCV::F1_H, + RISCV::F2_H, RISCV::F3_H, RISCV::F4_H, RISCV::F5_H, RISCV::F6_H, + RISCV::F7_H, RISCV::F28_H, RISCV::F29_H, RISCV::F30_H, RISCV::F31_H}; + if (MCRegister Reg = State.AllocateReg(FPR16List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f32 && Subtarget.hasStdExtF()) { + static const MCPhysReg FPR32List[] = { + RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, RISCV::F14_F, + RISCV::F15_F, RISCV::F16_F, RISCV::F17_F, RISCV::F0_F, RISCV::F1_F, + RISCV::F2_F, RISCV::F3_F, RISCV::F4_F, RISCV::F5_F, RISCV::F6_F, + RISCV::F7_F, RISCV::F28_F, RISCV::F29_F, RISCV::F30_F, RISCV::F31_F}; + if (MCRegister Reg = State.AllocateReg(FPR32List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f64 && Subtarget.hasStdExtD()) { + static const MCPhysReg FPR64List[] = { + RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, RISCV::F14_D, + RISCV::F15_D, RISCV::F16_D, RISCV::F17_D, RISCV::F0_D, RISCV::F1_D, + RISCV::F2_D, RISCV::F3_D, RISCV::F4_D, RISCV::F5_D, RISCV::F6_D, + RISCV::F7_D, RISCV::F28_D, RISCV::F29_D, RISCV::F30_D, RISCV::F31_D}; + if (MCRegister Reg = State.AllocateReg(FPR64List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + // Check if there is an available GPR before hitting the stack. 
+ if ((LocVT == MVT::f16 && Subtarget.hasStdExtZhinxmin()) || + (LocVT == MVT::f32 && Subtarget.hasStdExtZfinx()) || + (LocVT == MVT::f64 && Subtarget.is64Bit() && + Subtarget.hasStdExtZdinx())) { + if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { + if (LocVT.getSizeInBits() != Subtarget.getXLen()) { + LocVT = Subtarget.getXLenVT(); + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + LocVT = Subtarget.getXLenVT(); + LocInfo = CCValAssign::BCvt; + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f16) { + unsigned Offset2 = State.AllocateStack(2, Align(2)); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset2, LocVT, LocInfo)); + return false; + } + + if (LocVT == MVT::i32 || LocVT == MVT::f32) { + unsigned Offset4 = State.AllocateStack(4, Align(4)); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo)); + return false; + } + + if (LocVT == MVT::i64 || LocVT == MVT::f64) { + unsigned Offset5 = State.AllocateStack(8, Align(8)); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo)); + return false; + } + + if (LocVT.isVector()) { + if (MCRegister Reg = allocateRVVReg(ValVT, ValNo, State, TLI)) { + // Fixed-length vectors are located in the corresponding scalable-vector + // container types. + if (ValVT.isFixedLengthVector()) + LocVT = TLI.getContainerForFixedLengthVector(LocVT); + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + } else { + // Try and pass the address via a "fast" GPR. + if (MCRegister GPRReg = State.AllocateReg(getFastCCArgGPRs(ABI))) { + LocInfo = CCValAssign::Indirect; + LocVT = TLI.getSubtarget().getXLenVT(); + State.addLoc(CCValAssign::getReg(ValNo, ValVT, GPRReg, LocVT, LocInfo)); + } else if (ValVT.isFixedLengthVector()) { + auto StackAlign = + MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne(); + unsigned StackOffset = + State.AllocateStack(ValVT.getStoreSize(), StackAlign); + State.addLoc( + CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + } else { + // Can't pass scalable vectors on the stack. + return true; + } + } + + return false; + } + + return true; // CC didn't match. +} + +bool llvm::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State) { + if (ArgFlags.isNest()) { + report_fatal_error( + "Attribute 'nest' is not supported in GHC calling convention"); + } + + static const MCPhysReg GPRList[] = { + RISCV::X9, RISCV::X18, RISCV::X19, RISCV::X20, RISCV::X21, RISCV::X22, + RISCV::X23, RISCV::X24, RISCV::X25, RISCV::X26, RISCV::X27}; + + if (LocVT == MVT::i32 || LocVT == MVT::i64) { + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, R7, SpLim + // s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 + if (MCRegister Reg = State.AllocateReg(GPRList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + const RISCVSubtarget &Subtarget = + State.getMachineFunction().getSubtarget(); + + if (LocVT == MVT::f32 && Subtarget.hasStdExtF()) { + // Pass in STG registers: F1, ..., F6 + // fs0 ... 
fs5 + static const MCPhysReg FPR32List[] = {RISCV::F8_F, RISCV::F9_F, + RISCV::F18_F, RISCV::F19_F, + RISCV::F20_F, RISCV::F21_F}; + if (MCRegister Reg = State.AllocateReg(FPR32List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f64 && Subtarget.hasStdExtD()) { + // Pass in STG registers: D1, ..., D6 + // fs6 ... fs11 + static const MCPhysReg FPR64List[] = {RISCV::F22_D, RISCV::F23_D, + RISCV::F24_D, RISCV::F25_D, + RISCV::F26_D, RISCV::F27_D}; + if (MCRegister Reg = State.AllocateReg(FPR64List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if ((LocVT == MVT::f32 && Subtarget.hasStdExtZfinx()) || + (LocVT == MVT::f64 && Subtarget.hasStdExtZdinx() && + Subtarget.is64Bit())) { + if (MCRegister Reg = State.AllocateReg(GPRList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + report_fatal_error("No registers left in GHC calling convention"); + return true; +} diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h new file mode 100644 index 0000000000000..9154a31d11608 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h @@ -0,0 +1,51 @@ +//===-- RISCVCallingConv.h - RISC-V Custom CC Routines ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the custom routines for the RISC-V Calling Convention. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/RISCVBaseInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" + +namespace llvm { + +class DataLayout; +class RISCVTargetLowering; + +/// RISCVCCAssignFn - This target-specific function extends the default +/// CCValAssign with additional information used to lower RISC-V calling +/// conventions. 
+typedef bool RISCVCCAssignFn(const DataLayout &DL, RISCVABI::ABI,
+                             unsigned ValNo, MVT ValVT, MVT LocVT,
+                             CCValAssign::LocInfo LocInfo,
+                             ISD::ArgFlagsTy ArgFlags, CCState &State,
+                             bool IsFixed, bool IsRet, Type *OrigTy,
+                             const RISCVTargetLowering &TLI);
+
+bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
+              MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
+              ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
+              bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI);
+
+bool CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
+                     MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
+                     ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
+                     bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI);
+
+bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
+                  CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                  CCState &State);
+
+namespace RISCV {
+
+ArrayRef<MCPhysReg> getArgGPRs(const RISCVABI::ABI ABI);
+
+} // end namespace RISCV
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6b4219b462384..acee6443bc452 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18934,492 +18934,6 @@ void RISCVTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
         MachineOperand::CreateReg(RISCV::FRM, /*isDef*/ false, /*isImp*/ true));
 }
 
-// Calling Convention Implementation.
-// The expectations for frontend ABI lowering vary from target to target.
-// Ideally, an LLVM frontend would be able to avoid worrying about many ABI
-// details, but this is a longer term goal. For now, we simply try to keep the
-// role of the frontend as simple and well-defined as possible. The rules can
-// be summarised as:
-// * Never split up large scalar arguments. We handle them here.
-// * If a hardfloat calling convention is being used, and the struct may be
-// passed in a pair of registers (fp+fp, int+fp), and both registers are
-// available, then pass as two separate arguments. If either the GPRs or FPRs
-// are exhausted, then pass according to the rule below.
-// * If a struct could never be passed in registers or directly in a stack
-// slot (as it is larger than 2*XLEN and the floating point rules don't
-// apply), then pass it using a pointer with the byval attribute.
-// * If a struct is less than 2*XLEN, then coerce to either a two-element
-// word-sized array or a 2*XLEN scalar (depending on alignment).
-// * The frontend can determine whether a struct is returned by reference or
-// not based on its size and fields. If it will be returned by reference, the
-// frontend must modify the prototype so a pointer with the sret annotation is
-// passed as the first argument. This is not necessary for large scalar
-// returns.
-// * Struct return values and varargs should be coerced to structs containing
-// register-size fields in the same situations they would be for fixed
-// arguments.
- -static const MCPhysReg ArgFPR16s[] = { - RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, RISCV::F13_H, - RISCV::F14_H, RISCV::F15_H, RISCV::F16_H, RISCV::F17_H -}; -static const MCPhysReg ArgFPR32s[] = { - RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, - RISCV::F14_F, RISCV::F15_F, RISCV::F16_F, RISCV::F17_F -}; -static const MCPhysReg ArgFPR64s[] = { - RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, - RISCV::F14_D, RISCV::F15_D, RISCV::F16_D, RISCV::F17_D -}; -// This is an interim calling convention and it may be changed in the future. -static const MCPhysReg ArgVRs[] = { - RISCV::V8, RISCV::V9, RISCV::V10, RISCV::V11, RISCV::V12, RISCV::V13, - RISCV::V14, RISCV::V15, RISCV::V16, RISCV::V17, RISCV::V18, RISCV::V19, - RISCV::V20, RISCV::V21, RISCV::V22, RISCV::V23}; -static const MCPhysReg ArgVRM2s[] = {RISCV::V8M2, RISCV::V10M2, RISCV::V12M2, - RISCV::V14M2, RISCV::V16M2, RISCV::V18M2, - RISCV::V20M2, RISCV::V22M2}; -static const MCPhysReg ArgVRM4s[] = {RISCV::V8M4, RISCV::V12M4, RISCV::V16M4, - RISCV::V20M4}; -static const MCPhysReg ArgVRM8s[] = {RISCV::V8M8, RISCV::V16M8}; -static const MCPhysReg ArgVRN2M1s[] = { - RISCV::V8_V9, RISCV::V9_V10, RISCV::V10_V11, RISCV::V11_V12, - RISCV::V12_V13, RISCV::V13_V14, RISCV::V14_V15, RISCV::V15_V16, - RISCV::V16_V17, RISCV::V17_V18, RISCV::V18_V19, RISCV::V19_V20, - RISCV::V20_V21, RISCV::V21_V22, RISCV::V22_V23}; -static const MCPhysReg ArgVRN3M1s[] = { - RISCV::V8_V9_V10, RISCV::V9_V10_V11, RISCV::V10_V11_V12, - RISCV::V11_V12_V13, RISCV::V12_V13_V14, RISCV::V13_V14_V15, - RISCV::V14_V15_V16, RISCV::V15_V16_V17, RISCV::V16_V17_V18, - RISCV::V17_V18_V19, RISCV::V18_V19_V20, RISCV::V19_V20_V21, - RISCV::V20_V21_V22, RISCV::V21_V22_V23}; -static const MCPhysReg ArgVRN4M1s[] = { - RISCV::V8_V9_V10_V11, RISCV::V9_V10_V11_V12, RISCV::V10_V11_V12_V13, - RISCV::V11_V12_V13_V14, RISCV::V12_V13_V14_V15, RISCV::V13_V14_V15_V16, - RISCV::V14_V15_V16_V17, RISCV::V15_V16_V17_V18, RISCV::V16_V17_V18_V19, - RISCV::V17_V18_V19_V20, RISCV::V18_V19_V20_V21, RISCV::V19_V20_V21_V22, - RISCV::V20_V21_V22_V23}; -static const MCPhysReg ArgVRN5M1s[] = { - RISCV::V8_V9_V10_V11_V12, RISCV::V9_V10_V11_V12_V13, - RISCV::V10_V11_V12_V13_V14, RISCV::V11_V12_V13_V14_V15, - RISCV::V12_V13_V14_V15_V16, RISCV::V13_V14_V15_V16_V17, - RISCV::V14_V15_V16_V17_V18, RISCV::V15_V16_V17_V18_V19, - RISCV::V16_V17_V18_V19_V20, RISCV::V17_V18_V19_V20_V21, - RISCV::V18_V19_V20_V21_V22, RISCV::V19_V20_V21_V22_V23}; -static const MCPhysReg ArgVRN6M1s[] = { - RISCV::V8_V9_V10_V11_V12_V13, RISCV::V9_V10_V11_V12_V13_V14, - RISCV::V10_V11_V12_V13_V14_V15, RISCV::V11_V12_V13_V14_V15_V16, - RISCV::V12_V13_V14_V15_V16_V17, RISCV::V13_V14_V15_V16_V17_V18, - RISCV::V14_V15_V16_V17_V18_V19, RISCV::V15_V16_V17_V18_V19_V20, - RISCV::V16_V17_V18_V19_V20_V21, RISCV::V17_V18_V19_V20_V21_V22, - RISCV::V18_V19_V20_V21_V22_V23}; -static const MCPhysReg ArgVRN7M1s[] = { - RISCV::V8_V9_V10_V11_V12_V13_V14, RISCV::V9_V10_V11_V12_V13_V14_V15, - RISCV::V10_V11_V12_V13_V14_V15_V16, RISCV::V11_V12_V13_V14_V15_V16_V17, - RISCV::V12_V13_V14_V15_V16_V17_V18, RISCV::V13_V14_V15_V16_V17_V18_V19, - RISCV::V14_V15_V16_V17_V18_V19_V20, RISCV::V15_V16_V17_V18_V19_V20_V21, - RISCV::V16_V17_V18_V19_V20_V21_V22, RISCV::V17_V18_V19_V20_V21_V22_V23}; -static const MCPhysReg ArgVRN8M1s[] = {RISCV::V8_V9_V10_V11_V12_V13_V14_V15, - RISCV::V9_V10_V11_V12_V13_V14_V15_V16, - RISCV::V10_V11_V12_V13_V14_V15_V16_V17, - RISCV::V11_V12_V13_V14_V15_V16_V17_V18, - RISCV::V12_V13_V14_V15_V16_V17_V18_V19, - 
RISCV::V13_V14_V15_V16_V17_V18_V19_V20, - RISCV::V14_V15_V16_V17_V18_V19_V20_V21, - RISCV::V15_V16_V17_V18_V19_V20_V21_V22, - RISCV::V16_V17_V18_V19_V20_V21_V22_V23}; -static const MCPhysReg ArgVRN2M2s[] = {RISCV::V8M2_V10M2, RISCV::V10M2_V12M2, - RISCV::V12M2_V14M2, RISCV::V14M2_V16M2, - RISCV::V16M2_V18M2, RISCV::V18M2_V20M2, - RISCV::V20M2_V22M2}; -static const MCPhysReg ArgVRN3M2s[] = { - RISCV::V8M2_V10M2_V12M2, RISCV::V10M2_V12M2_V14M2, - RISCV::V12M2_V14M2_V16M2, RISCV::V14M2_V16M2_V18M2, - RISCV::V16M2_V18M2_V20M2, RISCV::V18M2_V20M2_V22M2}; -static const MCPhysReg ArgVRN4M2s[] = { - RISCV::V8M2_V10M2_V12M2_V14M2, RISCV::V10M2_V12M2_V14M2_V16M2, - RISCV::V12M2_V14M2_V16M2_V18M2, RISCV::V14M2_V16M2_V18M2_V20M2, - RISCV::V16M2_V18M2_V20M2_V22M2}; -static const MCPhysReg ArgVRN2M4s[] = {RISCV::V8M4_V12M4, RISCV::V12M4_V16M4, - RISCV::V16M4_V20M4}; - -ArrayRef RISCV::getArgGPRs(const RISCVABI::ABI ABI) { - // The GPRs used for passing arguments in the ILP32* and LP64* ABIs, except - // the ILP32E ABI. - static const MCPhysReg ArgIGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, - RISCV::X13, RISCV::X14, RISCV::X15, - RISCV::X16, RISCV::X17}; - // The GPRs used for passing arguments in the ILP32E/ILP64E ABI. - static const MCPhysReg ArgEGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, - RISCV::X13, RISCV::X14, RISCV::X15}; - - if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) - return ArrayRef(ArgEGPRs); - - return ArrayRef(ArgIGPRs); -} - -static ArrayRef getFastCCArgGPRs(const RISCVABI::ABI ABI) { - // The GPRs used for passing arguments in the FastCC, X5 and X6 might be used - // for save-restore libcall, so we don't use them. - // Don't use X7 for fastcc, since Zicfilp uses X7 as the label register. - static const MCPhysReg FastCCIGPRs[] = { - RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, - RISCV::X16, RISCV::X17, RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31}; - - // The GPRs used for passing arguments in the FastCC when using ILP32E/ILP64E. - static const MCPhysReg FastCCEGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, - RISCV::X13, RISCV::X14, RISCV::X15}; - - if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) - return ArrayRef(FastCCEGPRs); - - return ArrayRef(FastCCIGPRs); -} - -// Pass a 2*XLEN argument that has been split into two XLEN values through -// registers or the stack as necessary. -static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, - ISD::ArgFlagsTy ArgFlags1, unsigned ValNo2, - MVT ValVT2, MVT LocVT2, - ISD::ArgFlagsTy ArgFlags2, bool EABI) { - unsigned XLenInBytes = XLen / 8; - const RISCVSubtarget &STI = - State.getMachineFunction().getSubtarget(); - ArrayRef ArgGPRs = RISCV::getArgGPRs(STI.getTargetABI()); - - if (MCRegister Reg = State.AllocateReg(ArgGPRs)) { - // At least one half can be passed via register. - State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg, - VA1.getLocVT(), CCValAssign::Full)); - } else { - // Both halves must be passed on the stack, with proper alignment. - // TODO: To be compatible with GCC's behaviors, we force them to have 4-byte - // alignment. This behavior may be changed when RV32E/ILP32E is ratified. 
- Align StackAlign(XLenInBytes); - if (!EABI || XLen != 32) - StackAlign = std::max(StackAlign, ArgFlags1.getNonZeroOrigAlign()); - State.addLoc( - CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(), - State.AllocateStack(XLenInBytes, StackAlign), - VA1.getLocVT(), CCValAssign::Full)); - State.addLoc(CCValAssign::getMem( - ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)), - LocVT2, CCValAssign::Full)); - return false; - } - - if (MCRegister Reg = State.AllocateReg(ArgGPRs)) { - // The second half can also be passed via register. - State.addLoc( - CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full)); - } else { - // The second half is passed via the stack, without additional alignment. - State.addLoc(CCValAssign::getMem( - ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)), - LocVT2, CCValAssign::Full)); - } - - return false; -} - -static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State, - const RISCVTargetLowering &TLI) { - const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT); - if (RC == &RISCV::VRRegClass) { - // Assign the first mask argument to V0. - // This is an interim calling convention and it may be changed in the - // future. - if (ValVT.getVectorElementType() == MVT::i1) - if (MCRegister Reg = State.AllocateReg(RISCV::V0)) - return Reg; - return State.AllocateReg(ArgVRs); - } - if (RC == &RISCV::VRM2RegClass) - return State.AllocateReg(ArgVRM2s); - if (RC == &RISCV::VRM4RegClass) - return State.AllocateReg(ArgVRM4s); - if (RC == &RISCV::VRM8RegClass) - return State.AllocateReg(ArgVRM8s); - if (RC == &RISCV::VRN2M1RegClass) - return State.AllocateReg(ArgVRN2M1s); - if (RC == &RISCV::VRN3M1RegClass) - return State.AllocateReg(ArgVRN3M1s); - if (RC == &RISCV::VRN4M1RegClass) - return State.AllocateReg(ArgVRN4M1s); - if (RC == &RISCV::VRN5M1RegClass) - return State.AllocateReg(ArgVRN5M1s); - if (RC == &RISCV::VRN6M1RegClass) - return State.AllocateReg(ArgVRN6M1s); - if (RC == &RISCV::VRN7M1RegClass) - return State.AllocateReg(ArgVRN7M1s); - if (RC == &RISCV::VRN8M1RegClass) - return State.AllocateReg(ArgVRN8M1s); - if (RC == &RISCV::VRN2M2RegClass) - return State.AllocateReg(ArgVRN2M2s); - if (RC == &RISCV::VRN3M2RegClass) - return State.AllocateReg(ArgVRN3M2s); - if (RC == &RISCV::VRN4M2RegClass) - return State.AllocateReg(ArgVRN4M2s); - if (RC == &RISCV::VRN2M4RegClass) - return State.AllocateReg(ArgVRN2M4s); - llvm_unreachable("Unhandled register class for ValueType"); -} - -// Implements the RISC-V calling convention. Returns true upon failure. -bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, - MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI) { - unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); - assert(XLen == 32 || XLen == 64); - MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; - - // Static chain parameter must not be passed in normal argument registers, - // so we assign t2 for it as done in GCC's __builtin_call_with_static_chain - if (ArgFlags.isNest()) { - if (MCRegister Reg = State.AllocateReg(RISCV::X7)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - // Any return value split in to more than two values can't be returned - // directly. Vectors are returned via the available vector registers. 
- if (!LocVT.isVector() && IsRet && ValNo > 1) - return true; - - // UseGPRForF16_F32 if targeting one of the soft-float ABIs, if passing a - // variadic argument, or if no F16/F32 argument registers are available. - bool UseGPRForF16_F32 = true; - // UseGPRForF64 if targeting soft-float ABIs or an FLEN=32 ABI, if passing a - // variadic argument, or if no F64 argument registers are available. - bool UseGPRForF64 = true; - - switch (ABI) { - default: - llvm_unreachable("Unexpected ABI"); - case RISCVABI::ABI_ILP32: - case RISCVABI::ABI_ILP32E: - case RISCVABI::ABI_LP64: - case RISCVABI::ABI_LP64E: - break; - case RISCVABI::ABI_ILP32F: - case RISCVABI::ABI_LP64F: - UseGPRForF16_F32 = !IsFixed; - break; - case RISCVABI::ABI_ILP32D: - case RISCVABI::ABI_LP64D: - UseGPRForF16_F32 = !IsFixed; - UseGPRForF64 = !IsFixed; - break; - } - - // FPR16, FPR32, and FPR64 alias each other. - if (State.getFirstUnallocated(ArgFPR32s) == std::size(ArgFPR32s)) { - UseGPRForF16_F32 = true; - UseGPRForF64 = true; - } - - // From this point on, rely on UseGPRForF16_F32, UseGPRForF64 and - // similar local variables rather than directly checking against the target - // ABI. - - ArrayRef ArgGPRs = RISCV::getArgGPRs(ABI); - - if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::bf16 || - (ValVT == MVT::f32 && XLen == 64))) { - MCRegister Reg = State.AllocateReg(ArgGPRs); - if (Reg) { - LocVT = XLenVT; - State.addLoc( - CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if (UseGPRForF16_F32 && - (ValVT == MVT::f16 || ValVT == MVT::bf16 || ValVT == MVT::f32)) { - LocVT = XLenVT; - LocInfo = CCValAssign::BCvt; - } else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) { - LocVT = MVT::i64; - LocInfo = CCValAssign::BCvt; - } - - // If this is a variadic argument, the RISC-V calling convention requires - // that it is assigned an 'even' or 'aligned' register if it has 8-byte - // alignment (RV32) or 16-byte alignment (RV64). An aligned register should - // be used regardless of whether the original argument was split during - // legalisation or not. The argument will not be passed by registers if the - // original type is larger than 2*XLEN, so the register alignment rule does - // not apply. - // TODO: To be compatible with GCC's behaviors, we don't align registers - // currently if we are using ILP32E calling convention. This behavior may be - // changed when RV32E/ILP32E is ratified. - unsigned TwoXLenInBytes = (2 * XLen) / 8; - if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes && - DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes && - ABI != RISCVABI::ABI_ILP32E) { - unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); - // Skip 'odd' register if necessary. - if (RegIdx != std::size(ArgGPRs) && RegIdx % 2 == 1) - State.AllocateReg(ArgGPRs); - } - - SmallVectorImpl &PendingLocs = State.getPendingLocs(); - SmallVectorImpl &PendingArgFlags = - State.getPendingArgFlags(); - - assert(PendingLocs.size() == PendingArgFlags.size() && - "PendingLocs and PendingArgFlags out of sync"); - - // Handle passing f64 on RV32D with a soft float ABI or when floating point - // registers are exhausted. - if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) { - assert(PendingLocs.empty() && "Can't lower f64 if it is split"); - // Depending on available argument GPRS, f64 may be passed in a pair of - // GPRs, split between a GPR and the stack, or passed completely on the - // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these - // cases. 
- MCRegister Reg = State.AllocateReg(ArgGPRs); - if (!Reg) { - unsigned StackOffset = State.AllocateStack(8, Align(8)); - State.addLoc( - CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); - return false; - } - LocVT = MVT::i32; - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - MCRegister HiReg = State.AllocateReg(ArgGPRs); - if (HiReg) { - State.addLoc( - CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo)); - } else { - unsigned StackOffset = State.AllocateStack(4, Align(4)); - State.addLoc( - CCValAssign::getCustomMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); - } - return false; - } - - // Fixed-length vectors are located in the corresponding scalable-vector - // container types. - if (ValVT.isFixedLengthVector()) - LocVT = TLI.getContainerForFixedLengthVector(LocVT); - - // Split arguments might be passed indirectly, so keep track of the pending - // values. Split vectors are passed via a mix of registers and indirectly, so - // treat them as we would any other argument. - if (ValVT.isScalarInteger() && (ArgFlags.isSplit() || !PendingLocs.empty())) { - LocVT = XLenVT; - LocInfo = CCValAssign::Indirect; - PendingLocs.push_back( - CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); - PendingArgFlags.push_back(ArgFlags); - if (!ArgFlags.isSplitEnd()) { - return false; - } - } - - // If the split argument only had two elements, it should be passed directly - // in registers or on the stack. - if (ValVT.isScalarInteger() && ArgFlags.isSplitEnd() && - PendingLocs.size() <= 2) { - assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()"); - // Apply the normal calling convention rules to the first half of the - // split argument. - CCValAssign VA = PendingLocs[0]; - ISD::ArgFlagsTy AF = PendingArgFlags[0]; - PendingLocs.clear(); - PendingArgFlags.clear(); - return CC_RISCVAssign2XLen( - XLen, State, VA, AF, ValNo, ValVT, LocVT, ArgFlags, - ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E); - } - - // Allocate to a register if possible, or else a stack slot. - MCRegister Reg; - unsigned StoreSizeBytes = XLen / 8; - Align StackAlign = Align(XLen / 8); - - if ((ValVT == MVT::f16 || ValVT == MVT::bf16) && !UseGPRForF16_F32) - Reg = State.AllocateReg(ArgFPR16s); - else if (ValVT == MVT::f32 && !UseGPRForF16_F32) - Reg = State.AllocateReg(ArgFPR32s); - else if (ValVT == MVT::f64 && !UseGPRForF64) - Reg = State.AllocateReg(ArgFPR64s); - else if (ValVT.isVector() || ValVT.isRISCVVectorTuple()) { - Reg = allocateRVVReg(ValVT, ValNo, State, TLI); - if (!Reg) { - // For return values, the vector must be passed fully via registers or - // via the stack. - // FIXME: The proposed vector ABI only mandates v8-v15 for return values, - // but we're using all of them. - if (IsRet) - return true; - // Try using a GPR to pass the address - if ((Reg = State.AllocateReg(ArgGPRs))) { - LocVT = XLenVT; - LocInfo = CCValAssign::Indirect; - } else if (ValVT.isScalableVector()) { - LocVT = XLenVT; - LocInfo = CCValAssign::Indirect; - } else { - // Pass fixed-length vectors on the stack. - LocVT = ValVT; - StoreSizeBytes = ValVT.getStoreSize(); - // Align vectors to their element sizes, being careful for vXi1 - // vectors. - StackAlign = MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne(); - } - } - } else { - Reg = State.AllocateReg(ArgGPRs); - } - - unsigned StackOffset = - Reg ? 
0 : State.AllocateStack(StoreSizeBytes, StackAlign); - - // If we reach this point and PendingLocs is non-empty, we must be at the - // end of a split argument that must be passed indirectly. - if (!PendingLocs.empty()) { - assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()"); - assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()"); - - for (auto &It : PendingLocs) { - if (Reg) - It.convertToReg(Reg); - else - It.convertToMem(StackOffset); - State.addLoc(It); - } - PendingLocs.clear(); - PendingArgFlags.clear(); - return false; - } - - assert((!UseGPRForF16_F32 || !UseGPRForF64 || LocVT == XLenVT || - (TLI.getSubtarget().hasVInstructions() && - (ValVT.isVector() || ValVT.isRISCVVectorTuple()))) && - "Expected an XLenVT or vector types at this stage"); - - if (Reg) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - - // When a scalar floating-point value is passed on the stack, no - // bit-conversion is needed. - if (ValVT.isFloatingPoint() && LocInfo != CCValAssign::Indirect) { - assert(!ValVT.isVector()); - LocVT = ValVT; - LocInfo = CCValAssign::Full; - } - State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); - return false; -} - void RISCVTargetLowering::analyzeInputArgs( MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet, @@ -19631,189 +19145,6 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); } -// FastCC has less than 1% performance improvement for some particular -// benchmark. But theoretically, it may have benefit for some cases. -bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, - unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy, - const RISCVTargetLowering &TLI) { - if (LocVT == MVT::i32 || LocVT == MVT::i64) { - if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - const RISCVSubtarget &Subtarget = TLI.getSubtarget(); - - if (LocVT == MVT::f16 && Subtarget.hasStdExtZfhmin()) { - static const MCPhysReg FPR16List[] = { - RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, RISCV::F13_H, RISCV::F14_H, - RISCV::F15_H, RISCV::F16_H, RISCV::F17_H, RISCV::F0_H, RISCV::F1_H, - RISCV::F2_H, RISCV::F3_H, RISCV::F4_H, RISCV::F5_H, RISCV::F6_H, - RISCV::F7_H, RISCV::F28_H, RISCV::F29_H, RISCV::F30_H, RISCV::F31_H}; - if (MCRegister Reg = State.AllocateReg(FPR16List)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if (LocVT == MVT::f32 && Subtarget.hasStdExtF()) { - static const MCPhysReg FPR32List[] = { - RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, RISCV::F14_F, - RISCV::F15_F, RISCV::F16_F, RISCV::F17_F, RISCV::F0_F, RISCV::F1_F, - RISCV::F2_F, RISCV::F3_F, RISCV::F4_F, RISCV::F5_F, RISCV::F6_F, - RISCV::F7_F, RISCV::F28_F, RISCV::F29_F, RISCV::F30_F, RISCV::F31_F}; - if (MCRegister Reg = State.AllocateReg(FPR32List)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if (LocVT == MVT::f64 && Subtarget.hasStdExtD()) { - static const MCPhysReg FPR64List[] = { - RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, RISCV::F14_D, - RISCV::F15_D, RISCV::F16_D, RISCV::F17_D, RISCV::F0_D, RISCV::F1_D, - RISCV::F2_D, RISCV::F3_D, RISCV::F4_D, 
RISCV::F5_D, RISCV::F6_D, - RISCV::F7_D, RISCV::F28_D, RISCV::F29_D, RISCV::F30_D, RISCV::F31_D}; - if (MCRegister Reg = State.AllocateReg(FPR64List)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - // Check if there is an available GPR before hitting the stack. - if ((LocVT == MVT::f16 && Subtarget.hasStdExtZhinxmin()) || - (LocVT == MVT::f32 && Subtarget.hasStdExtZfinx()) || - (LocVT == MVT::f64 && Subtarget.is64Bit() && - Subtarget.hasStdExtZdinx())) { - if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { - if (LocVT.getSizeInBits() != Subtarget.getXLen()) { - LocVT = Subtarget.getXLenVT(); - State.addLoc( - CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - LocVT = Subtarget.getXLenVT(); - LocInfo = CCValAssign::BCvt; - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if (LocVT == MVT::f16) { - unsigned Offset2 = State.AllocateStack(2, Align(2)); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset2, LocVT, LocInfo)); - return false; - } - - if (LocVT == MVT::i32 || LocVT == MVT::f32) { - unsigned Offset4 = State.AllocateStack(4, Align(4)); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo)); - return false; - } - - if (LocVT == MVT::i64 || LocVT == MVT::f64) { - unsigned Offset5 = State.AllocateStack(8, Align(8)); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo)); - return false; - } - - if (LocVT.isVector()) { - if (MCRegister Reg = allocateRVVReg(ValVT, ValNo, State, TLI)) { - // Fixed-length vectors are located in the corresponding scalable-vector - // container types. - if (ValVT.isFixedLengthVector()) - LocVT = TLI.getContainerForFixedLengthVector(LocVT); - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - } else { - // Try and pass the address via a "fast" GPR. - if (MCRegister GPRReg = State.AllocateReg(getFastCCArgGPRs(ABI))) { - LocInfo = CCValAssign::Indirect; - LocVT = TLI.getSubtarget().getXLenVT(); - State.addLoc(CCValAssign::getReg(ValNo, ValVT, GPRReg, LocVT, LocInfo)); - } else if (ValVT.isFixedLengthVector()) { - auto StackAlign = - MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne(); - unsigned StackOffset = - State.AllocateStack(ValVT.getStoreSize(), StackAlign); - State.addLoc( - CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); - } else { - // Can't pass scalable vectors on the stack. - return true; - } - } - - return false; - } - - return true; // CC didn't match. -} - -bool RISCV::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - if (ArgFlags.isNest()) { - report_fatal_error( - "Attribute 'nest' is not supported in GHC calling convention"); - } - - static const MCPhysReg GPRList[] = { - RISCV::X9, RISCV::X18, RISCV::X19, RISCV::X20, RISCV::X21, RISCV::X22, - RISCV::X23, RISCV::X24, RISCV::X25, RISCV::X26, RISCV::X27}; - - if (LocVT == MVT::i32 || LocVT == MVT::i64) { - // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, R7, SpLim - // s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 - if (MCRegister Reg = State.AllocateReg(GPRList)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - const RISCVSubtarget &Subtarget = - State.getMachineFunction().getSubtarget(); - - if (LocVT == MVT::f32 && Subtarget.hasStdExtF()) { - // Pass in STG registers: F1, ..., F6 - // fs0 ... 
fs5 - static const MCPhysReg FPR32List[] = {RISCV::F8_F, RISCV::F9_F, - RISCV::F18_F, RISCV::F19_F, - RISCV::F20_F, RISCV::F21_F}; - if (MCRegister Reg = State.AllocateReg(FPR32List)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if (LocVT == MVT::f64 && Subtarget.hasStdExtD()) { - // Pass in STG registers: D1, ..., D6 - // fs6 ... fs11 - static const MCPhysReg FPR64List[] = {RISCV::F22_D, RISCV::F23_D, - RISCV::F24_D, RISCV::F25_D, - RISCV::F26_D, RISCV::F27_D}; - if (MCRegister Reg = State.AllocateReg(FPR64List)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if ((LocVT == MVT::f32 && Subtarget.hasStdExtZfinx()) || - (LocVT == MVT::f64 && Subtarget.hasStdExtZdinx() && - Subtarget.is64Bit())) { - if (MCRegister Reg = State.AllocateReg(GPRList)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - report_fatal_error("No registers left in GHC calling convention"); - return true; -} - // Transform physical registers into virtual registers. SDValue RISCVTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, @@ -19864,11 +19195,11 @@ SDValue RISCVTargetLowering::LowerFormalArguments( CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); if (CallConv == CallingConv::GHC) - CCInfo.AnalyzeFormalArguments(Ins, RISCV::CC_RISCV_GHC); + CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_GHC); else analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false, - CallConv == CallingConv::Fast ? RISCV::CC_RISCV_FastCC - : RISCV::CC_RISCV); + CallConv == CallingConv::Fast ? CC_RISCV_FastCC + : CC_RISCV); for (unsigned i = 0, e = ArgLocs.size(), InsIdx = 0; i != e; ++i, ++InsIdx) { CCValAssign &VA = ArgLocs[i]; @@ -20072,11 +19403,11 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, if (CallConv == CallingConv::GHC) { if (Subtarget.hasStdExtE()) report_fatal_error("GHC calling convention is not supported on RVE!"); - ArgCCInfo.AnalyzeCallOperands(Outs, RISCV::CC_RISCV_GHC); + ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_GHC); } else analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI, - CallConv == CallingConv::Fast ? RISCV::CC_RISCV_FastCC - : RISCV::CC_RISCV); + CallConv == CallingConv::Fast ? CC_RISCV_FastCC + : CC_RISCV); // Check if it's really possible to do a tail call. if (IsTailCall) @@ -20319,7 +19650,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // Assign locations to each value returned by this call. SmallVector RVLocs; CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext()); - analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true, RISCV::CC_RISCV); + analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true, CC_RISCV); // Copy all of the result registers out of their specified physreg. 
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { @@ -20358,9 +19689,9 @@ bool RISCVTargetLowering::CanLowerReturn( MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); - if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full, - ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, - nullptr, *this)) + if (CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full, + ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr, + *this)) return false; } return true; @@ -20383,7 +19714,7 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, *DAG.getContext()); analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true, - nullptr, RISCV::CC_RISCV); + nullptr, CC_RISCV); if (CallConv == CallingConv::GHC && !RVLocs.empty()) report_fatal_error("GHC functions return void only"); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 3beee4686956e..220219f98d3b2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H #include "RISCV.h" +#include "RISCVCallingConv.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" @@ -896,16 +897,6 @@ class RISCVTargetLowering : public TargetLowering { MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override; - /// RISCVCCAssignFn - This target-specific function extends the default - /// CCValAssign with additional information used to lower RISC-V calling - /// conventions. - typedef bool RISCVCCAssignFn(const DataLayout &DL, RISCVABI::ABI, - unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy, - const RISCVTargetLowering &TLI); - private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet, @@ -1048,26 +1039,6 @@ class RISCVTargetLowering : public TargetLowering { SDValue End, SDValue Flags, SDLoc DL) const; }; -namespace RISCV { - -bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, - MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI); - -bool CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, - MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI); - -bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); - -ArrayRef getArgGPRs(const RISCVABI::ABI ABI); - -} // end namespace RISCV - namespace RISCVVIntrinsicsTable { struct RISCVVIntrinsicInfo { From dcfa147c9d9569ea44cb0f0b6981f69a62b87f71 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 6 Sep 2024 05:29:55 +0000 Subject: [PATCH 330/425] [gn build] Port 093b8bfe6b64 --- llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn index c08c7c9828af7..6316e18b2dd97 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn +++ 
b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn @@ -112,6 +112,7 @@ static_library("LLVMRISCVCodeGen") { "GISel/RISCVPreLegalizerCombiner.cpp", "GISel/RISCVRegisterBankInfo.cpp", "RISCVAsmPrinter.cpp", + "RISCVCallingConv.cpp", "RISCVCodeGenPrepare.cpp", "RISCVDeadRegisterDefinitions.cpp", "RISCVExpandAtomicPseudoInsts.cpp", From 24267a7e14b35f41ab55e15ba12bb80c82881941 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 5 Sep 2024 22:57:27 -0700 Subject: [PATCH 331/425] AMDGPU: Add f64 to f32 support for llvm.fptrunc.round (#107481) --- llvm/lib/Target/AMDGPU/SIInstructions.td | 6 +++ llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 12 ++++-- .../CodeGen/AMDGPU/llvm.fptrunc.round.err.ll | 28 ++++++------- .../test/CodeGen/AMDGPU/llvm.fptrunc.round.ll | 39 +++++++++++++++++++ 4 files changed, 68 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 69e1b9a38324f..c0154645b391d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -230,11 +230,17 @@ def S_INVERSE_BALLOT_U64 : SPseudoInstSI< let Uses = [MODE, EXEC] in { def FPTRUNC_ROUND_F16_F32_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), (ins VGPR_32:$src0, i32imm:$round)>; + +def FPTRUNC_ROUND_F32_F64_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), + (ins VReg_64:$src0, i32imm:$round)>; } // End Uses = [MODE, EXEC] def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))), (FPTRUNC_ROUND_F16_F32_PSEUDO $src0, (as_hw_round_mode $round))>; +def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))), + (FPTRUNC_ROUND_F32_F64_PSEUDO $src0, (as_hw_round_mode $round))>; + // Invert the exec mask and overwrite the inactive lanes of dst with inactive, // restoring it after we're done. 
let Defs = [SCC], isConvergent = 1 in { diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index a590c6560942c..6bcf9757d2945 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -165,7 +165,8 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII) { unsigned Opcode = MI.getOpcode(); if (TII->usesFPDPRounding(MI) || - Opcode == AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO) { + Opcode == AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO || + Opcode == AMDGPU::FPTRUNC_ROUND_F32_F64_PSEUDO) { switch (Opcode) { case AMDGPU::V_INTERP_P1LL_F16: case AMDGPU::V_INTERP_P1LV_F16: @@ -189,8 +190,13 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI, B.addImm(0); // omod } else MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32)); - return Status(FP_ROUND_MODE_DP(3), - FP_ROUND_MODE_DP(Mode)); + return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(Mode)); + } + case AMDGPU::FPTRUNC_ROUND_F32_F64_PSEUDO: { + unsigned Mode = MI.getOperand(2).getImm(); + MI.removeOperand(2); + MI.setDesc(TII->get(AMDGPU::V_CVT_F32_F64_e32)); + return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(Mode)); } default: return DefaultStatus; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll index 291fe00a6177b..21fe1ce4dc1d6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll @@ -3,15 +3,15 @@ ; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F64-FAIL %s ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F64-FAIL %s -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-FAIL %s -; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-FAIL %s - ; TODO: check for GISEL when bfloat is supported. 
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/bf16-f32-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=BF16-F32-FAIL %s ; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/bf16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=BF16-F64-FAIL %s -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=TONEARESTAWAY-FAIL %s -; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=TONEARESTAWAY-FAIL %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F32-TONEARESTAWAY-FAIL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F32-TONEARESTAWAY-FAIL %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-TONEARESTAWAY-FAIL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-TONEARESTAWAY-FAIL %s ;--- f16-f64-err.ll define amdgpu_gs void @test_fptrunc_round_f16_f64(double %a, ptr addrspace(1) %out) { @@ -21,14 +21,6 @@ define amdgpu_gs void @test_fptrunc_round_f16_f64(double %a, ptr addrspace(1) %o ret void } -;--- f32-f64-err.ll -define amdgpu_gs void @test_fptrunc_round_f32_f64(double %a, ptr addrspace(1) %out) { -; F32-F64-FAIL: LLVM ERROR: Cannot select - %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.downward") - store float %res, ptr addrspace(1) %out, align 4 - ret void -} - ;--- bf16-f32-err.ll define amdgpu_gs void @test_fptrunc_round_bf16_f32(float %a, ptr addrspace(1) %out) { ; BF16-F32-FAIL: LLVM ERROR: Cannot select @@ -47,8 +39,16 @@ define amdgpu_gs void @test_fptrunc_round_bf16_f64(double %a, ptr addrspace(1) % ;--- f16-f32-tonearestaway-err.ll define amdgpu_gs void @test_fptrunc_round_f16_f32_tonearestaway(float %a, ptr addrspace(1) %out) { -; TONEARESTAWAY-FAIL: LLVM ERROR: Cannot select +; F16-F32-TONEARESTAWAY-FAIL: LLVM ERROR: Cannot select %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.tonearestaway") store half %res, ptr addrspace(1) %out, align 2 ret void } + +;--- f32-f64-tonearestaway-err.ll +define amdgpu_gs void @test_fptrunc_round_f32_f64_tonearestaway(double %a, ptr addrspace(1) %out) { +; F32-F64-TONEARESTAWAY-FAIL: LLVM ERROR: Cannot select + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.tonearestaway") + store float %res, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll index 54ed6f1eb4282..3d9ce6e79d9d2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll @@ -516,3 +516,42 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float> %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward") ret <8 x half> %res } + +define amdgpu_gs float @v_fptrunc_round_f64_to_f32_tonearest(double %a) { +; CHECK-LABEL: 
v_fptrunc_round_f64_to_f32_tonearest: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; CHECK-NEXT: ; return to shader part epilog + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.tonearest") + ret float %res +} + +define amdgpu_gs float @v_fptrunc_round_f64_to_f32_upward(double %a) { +; CHECK-LABEL: v_fptrunc_round_f64_to_f32_upward: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 +; CHECK-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; CHECK-NEXT: ; return to shader part epilog + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.upward") + ret float %res +} + +define amdgpu_gs float @v_fptrunc_round_f64_to_f32_downward(double %a) { +; CHECK-LABEL: v_fptrunc_round_f64_to_f32_downward: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 +; CHECK-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; CHECK-NEXT: ; return to shader part epilog + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.downward") + ret float %res +} + +define amdgpu_gs float @v_fptrunc_round_f64_to_f32_towardzero(double %a) { +; CHECK-LABEL: v_fptrunc_round_f64_to_f32_towardzero: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 +; CHECK-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; CHECK-NEXT: ; return to shader part epilog + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.towardzero") + ret float %res +} From ddf40e0132cdfb9443e8dce9ca18d4f5595fb73c Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 6 Sep 2024 09:00:00 +0200 Subject: [PATCH 332/425] [lldb] Correctly reconstruct simplified names for type units (#107240) We need to resolve the type signature to get a hold of the template argument dies. The associated test case passes even without this patch, but it only does it by accident (because the subsequent code considers the types to be in an anonymous namespace and this not subject to uniqueing). This will change once my other patch starts resolving names correctly. 
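For illustration only (not part of this patch; the type and variable names are
taken from the test below, the member is a simplified stand-in): with
-gsimple-template-names the DW_AT_name of an instantiation is just the bare
template name, and with -fdebug-types-section the definition DIE that carries
the template parameter children lives in a type unit which the skeleton DIE
references through DW_AT_signature, so rebuilding the "<...>" suffix first
requires resolving that signature.

    // Compiled with: clang++ -g -gsimple-template-names -fdebug-types-section
    // DW_AT_name of the instantiation is plain "ReferencesBoth"; the "<'A'>"
    // part has to be reconstructed from DW_TAG_template_value_parameter DIEs,
    // which sit under the definition in the referenced type unit.
    template <char C> struct ReferencesBoth { char c = C; }; // simplified stand-in
    ReferencesBoth<'A'> both_a;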
--- .../Plugins/SymbolFile/DWARF/DWARFASTParser.h | 2 +- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 6 +++-- .../SymbolFile/DWARF/DWARFASTParserClang.h | 4 ++-- .../DWARF/x86/type-definition-search.cpp | 23 +++++++++++-------- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h index 636ff4b5cf11d..971cbe47fb702 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h @@ -58,7 +58,7 @@ class DWARFASTParser { virtual void EnsureAllDIEsInDeclContextHaveBeenParsed( CompilerDeclContext decl_context) = 0; - virtual std::string GetDIEClassTemplateParams(const DWARFDIE &die) = 0; + virtual std::string GetDIEClassTemplateParams(DWARFDIE die) = 0; static std::optional ParseChildArrayInfo(const DWARFDIE &parent_die, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 3c58be26c266b..4d688f9a1358b 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -818,8 +818,10 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc, &attrs.decl, clang_type, resolve_state, payload); } -std::string -DWARFASTParserClang::GetDIEClassTemplateParams(const DWARFDIE &die) { +std::string DWARFASTParserClang::GetDIEClassTemplateParams(DWARFDIE die) { + if (DWARFDIE signature_die = die.GetReferencedDIE(DW_AT_signature)) + die = signature_die; + if (llvm::StringRef(die.GetName()).contains("<")) return {}; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index ae6720608121f..2806fbbeb99d2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -106,8 +106,8 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// \return A string, including surrounding '<>', of the template parameters. /// If the DIE's name already has '<>', returns an empty string because /// it's assumed that the caller is using the DIE name anyway. 
- std::string GetDIEClassTemplateParams( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + std::string + GetDIEClassTemplateParams(lldb_private::plugin::dwarf::DWARFDIE die) override; void MapDeclDIEToDefDIE(const lldb_private::plugin::dwarf::DWARFDIE &decl_die, const lldb_private::plugin::dwarf::DWARFDIE &def_die); diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp index bce6ed36b0968..5a40a6e0fbc27 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp @@ -4,15 +4,20 @@ // REQUIRES: lld -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-a.o -g -gsimple-template-names -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-b.o -g -gsimple-template-names -DFILE_B -// RUN: ld.lld %t-a.o %t-b.o -o %t -// RUN: %lldb %t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s - -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -DFILE_B -// RUN: ld.lld %t-a.o %t-b.o -o %t -// RUN: %lldb %t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-a.o -g -gsimple-template-names -DFILE_A +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-b.o -g -gsimple-template-names -DFILE_B +// RUN: ld.lld %t-n-a.o %t-n-b.o -o %t-n +// RUN: %lldb %t-n -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s + +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-a.o -g -fdebug-types-section -DFILE_A +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-b.o -g -fdebug-types-section -DFILE_B +// RUN: ld.lld %t-t-a.o %t-t-b.o -o %t-t +// RUN: %lldb %t-t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s + +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-a.o -g -fdebug-types-section -gsimple-template-names -DFILE_A +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-b.o -g -fdebug-types-section -gsimple-template-names -DFILE_B +// RUN: ld.lld %t-tn-a.o %t-tn-b.o -o %t-tn +// RUN: %lldb %t-tn -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s // CHECK: (lldb) target variable // CHECK-NEXT: (ReferencesBoth<'A'>) both_a = { From 9c72a308d839a27ffcbb0c67104baceb1871c50e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Fri, 6 Sep 2024 09:08:26 +0200 Subject: [PATCH 333/425] [clang][ASTImporter] New fix for default template parameter values. (#101836) Commit e4440b8 added a change that introduced new crash in an incorrectly handled case. This is fixed here. Default argument definition or inheritance is preserved in the "To" AST compared to the "From". If the default argument is defined already in the "To" AST it can be duplicated at the import. 
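For reference, a minimal sketch of what an "inherited" default template
argument means, in plain C++ and independent of the importer (parameter and
type names here are made up):

    template <typename T = int> struct S;  // this redeclaration defines the default argument
    template <typename T> struct S {};     // this one only inherits it from the previous declaration
    S<> s;                                  // OK, T is int

In the AST the second declaration's parameter does not own a copy of the
default: its DefaultArgStorage points back at the parameter of the declaration
that spelled it out. That is the state the importer has to reproduce on the
"To" side without duplicating a default that already exists there.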
--- clang/lib/AST/ASTImporter.cpp | 118 +++------ clang/unittests/AST/ASTImporterTest.cpp | 334 +++++++++++++++++++----- 2 files changed, 309 insertions(+), 143 deletions(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index fa850409ba121..d335e34907b59 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -360,51 +360,42 @@ namespace clang { } template - void tryUpdateTemplateParmDeclInheritedFrom(NamedDecl *RecentParm, - NamedDecl *NewParm) { - if (auto *ParmT = dyn_cast(RecentParm)) { - if (ParmT->hasDefaultArgument()) { - auto *P = cast(NewParm); - P->removeDefaultArgument(); - P->setInheritedDefaultArgument(Importer.ToContext, ParmT); + Error importTemplateParameterDefaultArgument(const TemplateParmDeclT *D, + TemplateParmDeclT *ToD) { + Error Err = Error::success(); + if (D->hasDefaultArgument()) { + if (D->defaultArgumentWasInherited()) { + auto *ToInheritedFrom = const_cast( + importChecked(Err, D->getDefaultArgStorage().getInheritedFrom())); + if (Err) + return std::move(Err); + if (!ToInheritedFrom->hasDefaultArgument()) { + // Resolve possible circular dependency between default value of the + // template argument and the template declaration. + const auto ToInheritedDefaultArg = + importChecked(Err, D->getDefaultArgStorage() + .getInheritedFrom() + ->getDefaultArgument()); + if (Err) + return std::move(Err); + ToInheritedFrom->setDefaultArgument(Importer.getToContext(), + ToInheritedDefaultArg); + } + ToD->setInheritedDefaultArgument(ToD->getASTContext(), + ToInheritedFrom); + } else { + Expected ToDefaultArgOrErr = + import(D->getDefaultArgument()); + if (!ToDefaultArgOrErr) + return ToDefaultArgOrErr.takeError(); + // Default argument could have been set in the + // '!ToInheritedFrom->hasDefaultArgument()' branch above. + if (!ToD->hasDefaultArgument()) + ToD->setDefaultArgument(Importer.getToContext(), + *ToDefaultArgOrErr); } } - } - - // Update the parameter list `NewParams` of a template declaration - // by "inheriting" default argument values from `RecentParams`, - // which is the parameter list of an earlier declaration of the - // same template. (Note that "inheriting" default argument values - // is not related to object-oriented inheritance.) - // - // In the clang AST template parameters (NonTypeTemplateParmDec, - // TemplateTypeParmDecl, TemplateTemplateParmDecl) have a reference to the - // default value, if one is specified at the first declaration. The default - // value can be specified only once. The template parameters of the - // following declarations have a reference to the original default value - // through the "inherited" value. This value should be set for all imported - // template parameters that have a previous declaration (also a previous - // template declaration). - // - // In the `Visit*ParmDecl` functions the default value of these template - // arguments is always imported. At that location the previous declaration - // is not easily accessible, it is not possible to call - // `setInheritedDefaultArgument` at that place. - // `updateTemplateParametersInheritedFrom` is called later when the already - // imported default value is erased and changed to "inherited". - // It is important to change the mode to "inherited" otherwise false - // structural in-equivalences could be detected. 
- void updateTemplateParametersInheritedFrom( - const TemplateParameterList &RecentParams, - TemplateParameterList &NewParams) { - for (auto [Idx, Param] : enumerate(RecentParams)) { - tryUpdateTemplateParmDeclInheritedFrom( - Param, NewParams.getParam(Idx)); - tryUpdateTemplateParmDeclInheritedFrom( - Param, NewParams.getParam(Idx)); - tryUpdateTemplateParmDeclInheritedFrom( - Param, NewParams.getParam(Idx)); - } + return Err; } public: @@ -5955,8 +5946,8 @@ ASTNodeImporter::VisitObjCPropertyImplDecl(ObjCPropertyImplDecl *D) { ExpectedDecl ASTNodeImporter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { // For template arguments, we adopt the translation unit as our declaration - // context. This context will be fixed when the actual template declaration - // is created. + // context. This context will be fixed when (during) the actual template + // declaration is created. ExpectedSLoc BeginLocOrErr = import(D->getBeginLoc()); if (!BeginLocOrErr) @@ -5988,13 +5979,8 @@ ASTNodeImporter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { ToD->setTypeConstraint(ToConceptRef, ToIDC); } - if (D->hasDefaultArgument()) { - Expected ToDefaultArgOrErr = - import(D->getDefaultArgument()); - if (!ToDefaultArgOrErr) - return ToDefaultArgOrErr.takeError(); - ToD->setDefaultArgument(ToD->getASTContext(), *ToDefaultArgOrErr); - } + if (Error Err = importTemplateParameterDefaultArgument(D, ToD)) + return Err; return ToD; } @@ -6020,13 +6006,9 @@ ASTNodeImporter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) { D->isParameterPack(), ToTypeSourceInfo)) return ToD; - if (D->hasDefaultArgument()) { - Expected ToDefaultArgOrErr = - import(D->getDefaultArgument()); - if (!ToDefaultArgOrErr) - return ToDefaultArgOrErr.takeError(); - ToD->setDefaultArgument(Importer.getToContext(), *ToDefaultArgOrErr); - } + Err = importTemplateParameterDefaultArgument(D, ToD); + if (Err) + return Err; return ToD; } @@ -6057,13 +6039,8 @@ ASTNodeImporter::VisitTemplateTemplateParmDecl(TemplateTemplateParmDecl *D) { *TemplateParamsOrErr)) return ToD; - if (D->hasDefaultArgument()) { - Expected ToDefaultArgOrErr = - import(D->getDefaultArgument()); - if (!ToDefaultArgOrErr) - return ToDefaultArgOrErr.takeError(); - ToD->setDefaultArgument(Importer.getToContext(), *ToDefaultArgOrErr); - } + if (Error Err = importTemplateParameterDefaultArgument(D, ToD)) + return Err; return ToD; } @@ -6201,9 +6178,6 @@ ExpectedDecl ASTNodeImporter::VisitClassTemplateDecl(ClassTemplateDecl *D) { } D2->setPreviousDecl(Recent); - - updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()), - **TemplateParamsOrErr); } return D2; @@ -6518,9 +6492,6 @@ ExpectedDecl ASTNodeImporter::VisitVarTemplateDecl(VarTemplateDecl *D) { ToTemplated->setPreviousDecl(PrevTemplated); } ToVarTD->setPreviousDecl(Recent); - - updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()), - **TemplateParamsOrErr); } return ToVarTD; @@ -6793,9 +6764,6 @@ ASTNodeImporter::VisitFunctionTemplateDecl(FunctionTemplateDecl *D) { TemplatedFD->setPreviousDecl(PrevTemplated); } ToFunc->setPreviousDecl(Recent); - - updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()), - *Params); } return ToFunc; diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index cc87e83e86055..aacecd3fbcd90 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -9836,41 +9836,75 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportMultipleAnonymousEnumDecls) { 
struct ImportTemplateParmDeclDefaultValue : public ASTImporterOptionSpecificTestBase { protected: - void checkTemplateParams(RedeclarableTemplateDecl *D) { - auto *CanD = cast(D->getCanonicalDecl()); - auto *CanNonTypeP = cast( - CanD->getTemplateParameters()->getParam(0)); - auto *CanTypeP = - cast(CanD->getTemplateParameters()->getParam(1)); - auto *CanTemplateP = cast( - CanD->getTemplateParameters()->getParam(2)); - EXPECT_FALSE(CanNonTypeP->getDefaultArgStorage().isInherited()); - EXPECT_FALSE(CanTypeP->getDefaultArgStorage().isInherited()); - EXPECT_FALSE(CanTemplateP->getDefaultArgStorage().isInherited()); - for (Decl *Redecl : D->redecls()) { - auto *ReD = cast(Redecl); - if (ReD != CanD) { - auto *NonTypeP = cast( - ReD->getTemplateParameters()->getParam(0)); - auto *TypeP = cast( - ReD->getTemplateParameters()->getParam(1)); - auto *TemplateP = cast( - ReD->getTemplateParameters()->getParam(2)); - EXPECT_TRUE(NonTypeP->getDefaultArgStorage().isInherited()); - EXPECT_TRUE(TypeP->getDefaultArgStorage().isInherited()); - EXPECT_TRUE(TemplateP->getDefaultArgStorage().isInherited()); - EXPECT_EQ(NonTypeP->getDefaultArgStorage().getInheritedFrom(), - CanNonTypeP); - EXPECT_EQ(TypeP->getDefaultArgStorage().getInheritedFrom(), CanTypeP); - EXPECT_EQ(TemplateP->getDefaultArgStorage().getInheritedFrom(), - CanTemplateP); - } + void checkTemplateParams(RedeclarableTemplateDecl *D, + RedeclarableTemplateDecl *InheritedFromD) { + auto *NonTypeP = + cast(D->getTemplateParameters()->getParam(0)); + auto *TypeP = + cast(D->getTemplateParameters()->getParam(1)); + auto *TemplateP = + cast(D->getTemplateParameters()->getParam(2)); + if (InheritedFromD) { + EXPECT_TRUE(NonTypeP->getDefaultArgStorage().isInherited()); + EXPECT_TRUE(TypeP->getDefaultArgStorage().isInherited()); + EXPECT_TRUE(TemplateP->getDefaultArgStorage().isInherited()); + EXPECT_EQ(NonTypeP->getDefaultArgStorage().getInheritedFrom(), + InheritedFromD->getTemplateParameters()->getParam(0)); + EXPECT_EQ(TypeP->getDefaultArgStorage().getInheritedFrom(), + InheritedFromD->getTemplateParameters()->getParam(1)); + EXPECT_EQ(TemplateP->getDefaultArgStorage().getInheritedFrom(), + InheritedFromD->getTemplateParameters()->getParam(2)); + } else { + EXPECT_FALSE(NonTypeP->getDefaultArgStorage().isInherited()); + EXPECT_FALSE(TypeP->getDefaultArgStorage().isInherited()); + EXPECT_FALSE(TemplateP->getDefaultArgStorage().isInherited()); } } - void testImport(RedeclarableTemplateDecl *FromD) { - RedeclarableTemplateDecl *ToD = Import(FromD, Lang_CXX14); - checkTemplateParams(ToD); + void testImport(RedeclarableTemplateDecl *FromD1, + RedeclarableTemplateDecl *FromD2, + RedeclarableTemplateDecl *FromD3, + RedeclarableTemplateDecl *ToExistingD1) { + auto *ToD1 = Import(FromD1, Lang_CXX14); + auto *ToD2 = Import(FromD2, Lang_CXX14); + auto *ToD3 = Import(FromD3, Lang_CXX14); + checkTemplateParams(ToD1, nullptr); + checkTemplateParams(ToD2, ToD1); + checkTemplateParams(ToD3, ToExistingD1 ? ToExistingD1 : ToD1); + } + + // In these tests a circular dependency is created between the template + // parameter default value and the template declaration (with the same + // template parameter). 
+ template + void + testTemplateParmDeclCircularDependency(ClassTemplateDecl *FromD, + ClassTemplateDecl *FromDInherited) { + auto GetTemplateParm = + [](ClassTemplateDecl *D) -> const TemplateParmDeclT * { + return dyn_cast( + D->getTemplateParameters()->getParam(0)); + }; + + ASSERT_FALSE(GetTemplateParm(FromD)->getDefaultArgStorage().isInherited()); + ASSERT_TRUE( + GetTemplateParm(FromDInherited)->getDefaultArgStorage().isInherited()); + + auto *ToD = Import(FromD, Lang_CXX14); + EXPECT_TRUE(ToD); + + auto *ToDInherited = Import(FromDInherited, Lang_CXX14); + EXPECT_TRUE(ToDInherited); + + EXPECT_FALSE(GetTemplateParm(ToD)->getDefaultArgStorage().isInherited()); + EXPECT_TRUE( + GetTemplateParm(ToDInherited)->getDefaultArgStorage().isInherited()); + EXPECT_EQ(GetTemplateParm(ToDInherited) + ->getDefaultArgStorage() + .getInheritedFrom(), + GetTemplateParm(ToD)); + + EXPECT_EQ(ToD->getPreviousDecl(), ToDInherited); } const char *CodeFunction = @@ -9878,81 +9912,245 @@ struct ImportTemplateParmDeclDefaultValue template struct X; template class C = X> - void f(); + void test(); template class C> - void f(); + void test(); template class C> - void f() {} + void test() {} )"; const char *CodeClass = R"( + namespace N { template struct X; template class C = X> - struct S; + struct test; template class C> - struct S; + struct test; template class C> - struct S {}; + struct test {}; + } )"; const char *CodeVar = R"( + namespace N { template struct X; template class C = X> - extern int V; + extern int test; template class C> - extern int V; + extern int test; template class C> - int V = A; + int test = A; + } )"; }; -TEST_P(ImportTemplateParmDeclDefaultValue, ImportFunctionTemplate) { - Decl *FromTU = getTuDecl(CodeFunction, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( +TEST_P(ImportTemplateParmDeclDefaultValue, InvisibleInheritedFrom) { + const char *ToCode = + R"( + template + void f() {} + )"; + TranslationUnitDecl *ToTU = getToTuDecl(ToCode, Lang_CXX14); + auto *ToFDef = FirstDeclMatcher().match( + ToTU, functionTemplateDecl(hasName("f"))); + + const char *FromCode = + R"( + template + void f() {} + template + void f(); + )"; + TranslationUnitDecl *FromTU = getTuDecl(FromCode, Lang_CXX14); + auto *FromFDef = FirstDeclMatcher().match( + FromTU, functionTemplateDecl(hasName("f"))); + auto *FromF = LastDeclMatcher().match( FromTU, functionTemplateDecl(hasName("f"))); - testImport(FromLastD); + + auto *ToFDefImported = Import(FromFDef, Lang_CXX14); + EXPECT_EQ(ToFDefImported, ToFDef); + auto *ToF = Import(FromF, Lang_CXX14); + EXPECT_NE(ToF, ToFDef); + const auto *Parm = dyn_cast( + ToF->getTemplateParameters()->getParam(0)); + EXPECT_TRUE(Parm->defaultArgumentWasInherited()); + // FIXME: This behavior may be confusing: + // Default value is not inherited from the existing declaration, instead a new + // is created at import that is similar to the existing but not reachable from + // the AST. 
+ EXPECT_NE(Parm->getDefaultArgStorage().getInheritedFrom(), + ToFDef->getTemplateParameters()->getParam(0)); +} + +TEST_P(ImportTemplateParmDeclDefaultValue, ImportFunctionTemplate) { + TranslationUnitDecl *FromTU = getTuDecl(CodeFunction, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, functionTemplateDecl(hasName("test") /*, hasBody(stmt())*/)); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, nullptr); } TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingFunctionTemplate) { - getToTuDecl(CodeFunction, Lang_CXX14); - Decl *FromTU = getTuDecl(CodeFunction, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( - FromTU, functionTemplateDecl(hasName("f"))); - testImport(FromLastD); + TranslationUnitDecl *ToTU = getToTuDecl(CodeFunction, Lang_CXX14); + auto *ToD1 = FirstDeclMatcher().match( + ToTU, functionTemplateDecl(hasName("test"))); + TranslationUnitDecl *FromTU = getTuDecl(CodeFunction, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, functionTemplateDecl(hasName("test"))); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, ToD1); } TEST_P(ImportTemplateParmDeclDefaultValue, ImportClassTemplate) { - Decl *FromTU = getTuDecl(CodeClass, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( - FromTU, classTemplateDecl(hasName("S"))); - testImport(FromLastD); + TranslationUnitDecl *FromTU = getTuDecl(CodeClass, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("test"))); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, nullptr); } TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingClassTemplate) { - getToTuDecl(CodeClass, Lang_CXX14); - Decl *FromTU = getTuDecl(CodeClass, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( - FromTU, classTemplateDecl(hasName("S"))); - testImport(FromLastD); + TranslationUnitDecl *ToTU = getToTuDecl(CodeClass, Lang_CXX14); + auto *ToD1 = FirstDeclMatcher().match( + ToTU, classTemplateDecl(hasName("test"))); + TranslationUnitDecl *FromTU = getTuDecl(CodeClass, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("test"))); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, ToD1); } TEST_P(ImportTemplateParmDeclDefaultValue, ImportVarTemplate) { - Decl *FromTU = getTuDecl(CodeVar, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( - FromTU, varTemplateDecl(hasName("V"))); - testImport(FromLastD); + TranslationUnitDecl *FromTU = getTuDecl(CodeVar, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, varTemplateDecl(hasName("test"))); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, nullptr); } TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingVarTemplate) { - getToTuDecl(CodeVar, Lang_CXX14); - Decl *FromTU = getTuDecl(CodeVar, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( - FromTU, varTemplateDecl(hasName("V"))); - testImport(FromLastD); + TranslationUnitDecl *ToTU = getToTuDecl(CodeVar, Lang_CXX14); + auto *ToD1 = FirstDeclMatcher().match( + ToTU, varTemplateDecl(hasName("test"))); + TranslationUnitDecl *FromTU = getTuDecl(CodeVar, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, varTemplateDecl(hasName("test"))); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + 
auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, ToD1); +} + +TEST_P(ImportTemplateParmDeclDefaultValue, + NonTypeTemplateParmDeclCircularDependency) { + const char *Code = + R"( + struct Z; + + struct Y { + Z *z; + static const int x = 1; + }; + + template + struct X; + + template + struct X { + static const int A = 1; + }; + + struct Z { + template + void f(int A = X

::A); + }; + )"; + + Decl *FromTU = getTuDecl(Code, Lang_CXX14); + auto *FromD = FirstDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + auto *FromDInherited = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + + testTemplateParmDeclCircularDependency( + FromD, FromDInherited); +} + +TEST_P(ImportTemplateParmDeclDefaultValue, + TemplateTypeParmDeclCircularDependency) { + const char *Code = + R"( + struct Z; + + struct Y { + Z *z; + }; + + template + struct X; + + template + struct X { + static const int A = 1; + }; + + struct Z { + template + void f(int A = X::A); + }; + )"; + + Decl *FromTU = getTuDecl(Code, Lang_CXX14); + auto *FromD = FirstDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + auto *FromDInherited = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + + testTemplateParmDeclCircularDependency(FromD, + FromDInherited); +} + +TEST_P(ImportTemplateParmDeclDefaultValue, + TemplateTemplateParmDeclCircularDependency) { + const char *Code = + R"( + struct Z; + + template + struct Y { + Z *z; + }; + + template